From 22569d9b511163e49f6822df0eac1b3fed69141d Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Fri, 18 Nov 2022 12:47:43 +0000 Subject: [PATCH 001/120] Initial work on supporting some async memory transfers --- Cargo.toml | 6 +- .../src/rust_to_cuda/field_copy.rs | 38 ++ rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 + rust-cuda-derive/src/rust_to_cuda/mod.rs | 7 + rust-cuda-ptx-jit/Cargo.toml | 2 +- src/common.rs | 62 +++- src/device/mod.rs | 12 +- src/host.rs | 351 ++++++++++++++++-- src/utils/aliasing/const.rs | 38 +- src/utils/aliasing/dynamic.rs | 39 +- src/utils/aliasing/final.rs | 38 +- src/utils/device_copy.rs | 48 ++- src/utils/exchange/buffer/common.rs | 4 +- src/utils/exchange/buffer/device.rs | 11 +- src/utils/exchange/buffer/host.rs | 65 +++- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 213 ++++++++++- src/utils/option.rs | 83 ++++- 18 files changed, 942 insertions(+), 81 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e8c86665b..17a279023 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = "0.1.2" +rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52" } -rustacuda = { version = "0.1.3", optional = true } -rustacuda_derive = { version = "0.1.2", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 0ddca9b28..93326aab6 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -13,8 +13,10 @@ pub fn 
impl_field_copy_init_and_expand_alloc_type( mut combined_cuda_alloc_type: TokenStream, r2c_field_declarations: &mut Vec, + r2c_field_async_declarations: &mut Vec, r2c_field_initialisations: &mut Vec, r2c_field_destructors: &mut Vec, + r2c_field_async_destructors: &mut Vec, c2r_field_initialisations: &mut Vec, ) -> TokenStream { @@ -35,6 +37,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( &self.#field_accessor, ); }); + r2c_field_async_declarations.push(quote! { + let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + &self.#field_accessor, + ); + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -60,6 +67,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + &self.#field_accessor, + alloc_front, + stream, + )?; + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -71,6 +85,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + &mut self.#field_accessor, + alloc_front, + stream, + )?; + }); c2r_field_initialisations.push(quote! { #optional_field_ident { @@ -94,6 +115,15 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + < + #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + >::from_ref(&self.#field_accessor), + alloc_front, + stream, + )?; + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -107,6 +137,14 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_destructors.push(quote! 
{ + let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + < + #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + >::from_mut(&mut self.#field_accessor), + alloc_front, + )?; + }); c2r_field_initialisations.push(quote! { #optional_field_ident { diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 8b99e4f73..2c6593068 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -42,6 +42,8 @@ pub fn cuda_struct_declaration( } } +// TODO: derive async impl as well -> need different trait bounds + #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( struct_name: &syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 18589b78a..00e756c00 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -25,8 +25,10 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { rust_cuda::host::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); + let mut r2c_field_async_declarations: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); let mut r2c_field_destructors: Vec = Vec::new(); + let mut r2c_field_async_destructors: Vec = Vec::new(); let mut c2r_field_initialisations: Vec = Vec::new(); @@ -40,6 +42,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { .. 
}) => { let mut r2c_field_destructors_reverse: Vec = Vec::new(); + let mut r2c_field_async_destructors_reverse: Vec = Vec::new(); for (field_index, field) in fields.iter_mut().enumerate() { let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field); @@ -50,14 +53,18 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &cuda_repr_field_ty, combined_cuda_alloc_type, &mut r2c_field_declarations, + &mut r2c_field_async_declarations, &mut r2c_field_initialisations, &mut r2c_field_destructors_reverse, + &mut r2c_field_async_destructors_reverse, &mut c2r_field_initialisations, ); } // The fields must be deallocated in the reverse order of their allocation r2c_field_destructors.extend(r2c_field_destructors_reverse.into_iter().rev()); + r2c_field_async_destructors + .extend(r2c_field_async_destructors_reverse.into_iter().rev()); }, syn::Fields::Unit => (), } diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index f2a4cd09a..d5b832eb8 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { version = "0.1.3", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/common.rs b/src/common.rs index b2d398e09..abb196c05 100644 --- a/src/common.rs +++ b/src/common.rs @@ -88,12 +88,13 @@ pub unsafe trait RustToCuda { #[doc(cfg(feature = "host"))] /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA /// /// # Safety /// /// This is an internal function and should NEVER be called manually - /// The returned `Self::CudaRepresentation` must NEVER be accessed on the + /// The returned 
[`Self::CudaRepresentation`] must NEVER be accessed on the /// CPU as it contains a GPU-resident copy of `self`. #[allow(clippy::type_complexity)] unsafe fn borrow( @@ -108,7 +109,8 @@ pub unsafe trait RustToCuda { #[doc(cfg(feature = "host"))] /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA /// /// # Safety /// @@ -120,6 +122,53 @@ pub unsafe trait RustToCuda { ) -> rustacuda::error::CudaResult; } +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCudaAsync: RustToCuda { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned + /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER + /// be accessed on the CPU as it contains a GPU-resident copy of + /// `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )>; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult; +} + /// # Safety /// /// This is an internal trait and should NEVER be implemented manually @@ -141,6 +190,13 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } +pub trait RustToCudaAsyncProxy: RustToCudaAsync { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + #[repr(transparent)] #[derive(Clone, Copy, TypeLayout)] pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { diff --git a/src/device/mod.rs b/src/device/mod.rs index 225bc8252..39ae0719f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -18,8 +18,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceConstRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda`. + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). unsafe fn with_borrow_from_rust) -> O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, @@ -28,8 +28,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr_mut` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda_mut`. 
+ /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). /// Furthermore, since different GPU threads can access heap storage /// mutably inside the safe `inner` scope, there must not be any /// aliasing between concurrently running threads. @@ -41,8 +41,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::move_to_cuda`. + /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). unsafe fn with_moved_from_rust O>( cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, diff --git a/src/host.rs b/src/host.rs index 6c91a26bc..3c19ac6fd 100644 --- a/src/host.rs +++ b/src/host.rs @@ -7,8 +7,9 @@ use core::{ use rustacuda::{ context::Context, error::{CudaError, CudaResult}, + event::Event, function::Function, - memory::{DeviceBox, DeviceBuffer, LockedBuffer}, + memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; @@ -32,7 +33,7 @@ pub trait Launcher { /// # Errors /// - /// Should only return a `CudaError` if some implementation-defined + /// Should only return a [`CudaError`] if some implementation-defined /// critical kernel function configuration failed. #[allow(unused_variables)] fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { @@ -72,7 +73,7 @@ pub struct TypedKernel { impl TypedKernel { /// # Errors /// - /// Returns a `CudaError` if `ptx` or `entry_point` contain nul bytes. + /// Returns a [`CudaError`] if `ptx` or `entry_point` contain nul bytes. 
pub fn new(ptx: &str, entry_point: &str) -> CudaResult { let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; @@ -92,7 +93,7 @@ impl TypedKernel { /// # Errors /// - /// Returns a `CudaError` if `ptx` (from [`Self::new`]) is not a valid + /// Returns a [`CudaError`] if `ptx` (from [`Self::new`]) is not a valid /// PTX source, or it does not contain an entry point named `entry_point` /// (from [`Self::new`]). pub fn compile_with_ptx_jit_args( @@ -122,12 +123,12 @@ impl TypedKernel { pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the - /// `DeviceConstRef` inside the closure + /// [`DeviceConstRef`] inside the closure /// - after the closure, `&self` will not have changed /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error occurs inside CUDA fn lend_to_cuda< O, E: From, @@ -141,7 +142,7 @@ pub trait LendToCuda: RustToCuda { /// Lends a mutable copy of `&mut self` to CUDA: /// - code in the CUDA kernel can only access `&mut self` through the - /// `DeviceMutRef` inside the closure + /// [`DeviceMutRef`] inside the closure /// - after the closure, `&mut self` might have changed in the following /// ways: /// - to avoid aliasing, each CUDA thread gets its own shallow copy of @@ -152,7 +153,7 @@ pub trait LendToCuda: RustToCuda { /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error occurs inside CUDA fn lend_to_cuda_mut< O, E: From, @@ -164,11 +165,11 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` is `SafeDeviceCopy` + /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error 
occurs inside CUDA fn move_to_cuda< O, E: From, @@ -338,6 +339,7 @@ macro_rules! impl_sealed_drop_collection { impl_sealed_drop_collection!(DeviceBuffer); impl_sealed_drop_collection!(DeviceBox); impl_sealed_drop_collection!(LockedBuffer); +impl_sealed_drop_collection!(LockedBox); macro_rules! impl_sealed_drop_value { ($type:ident) => { @@ -352,6 +354,64 @@ macro_rules! impl_sealed_drop_value { impl_sealed_drop_value!(Module); impl_sealed_drop_value!(Stream); impl_sealed_drop_value!(Context); +impl_sealed_drop_value!(Event); + +#[repr(transparent)] +#[allow(clippy::module_name_repetitions)] +pub struct HostLockedBox(*mut T); + +impl HostLockedBox { + /// # Errors + /// Returns a [`CudaError`] iff an error occurs inside CUDA + pub fn new(value: T) -> CudaResult { + // Safety: uninitialised memory is immediately written to without reading it + let locked_ptr = unsafe { + let locked_ptr: *mut T = LockedBox::into_raw(LockedBox::uninitialized()?); + locked_ptr.write(value); + locked_ptr + }; + + Ok(Self(locked_ptr)) + } +} + +impl Deref for HostLockedBox { + type Target = T; + + fn deref(&self) -> &Self::Target { + unsafe { &*self.0 } + } +} + +impl DerefMut for HostLockedBox { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.0 } + } +} + +impl From> for HostLockedBox { + fn from(locked_box: LockedBox) -> Self { + Self(LockedBox::into_raw(locked_box)) + } +} + +impl From> for LockedBox { + fn from(host_locked_box: HostLockedBox) -> Self { + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + unsafe { LockedBox::from_raw(host_locked_box.0) } + } +} + +impl Drop for HostLockedBox { + fn drop(&mut self) { + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. 
this function completes the roundtrip + let locked_box = unsafe { LockedBox::from_raw(self.0) }; + + core::mem::drop(CudaDropWrapper::from(locked_box)); + } +} #[repr(transparent)] #[allow(clippy::module_name_repetitions)] @@ -362,9 +422,9 @@ impl private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors /// - /// Returns a `CudaError` iff copying from `value` into `self` failed. + /// Returns a [`CudaError`] iff copying from `value` into `self` failed. pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; @@ -373,14 +433,73 @@ impl HostDeviceBox { /// # Errors /// - /// Returns a `CudaError` iff copying from `self` into `value` failed. + /// Returns a [`CudaError`] iff copying from `self` into `value` failed. pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; rustacuda::memory::CopyDestination::copy_to(&*device_box, value) } + + /// # Errors + /// + /// Returns a [`CudaError`] iff copying from `value` into `self` failed. + /// + /// # Safety + /// + /// To use the data inside the device box, either + /// - the passed-in [`Stream`] must be synchronised + /// - the kernel must be launched on the passed-in [`Stream`] + pub unsafe fn async_copy_from( + &mut self, + value: &HostLockedBox, + stream: &Stream, + ) -> CudaResult<()> { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. 
this function completes the roundtrip + let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + let locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; + + unsafe { + rustacuda::memory::AsyncCopyDestination::async_copy_from( + &mut *device_box, + &*locked_box, + stream, + ) + } + } + + /// # Errors + /// + /// Returns a [`CudaError`] iff copying from `self` into `value` failed. + /// + /// # Safety + /// + /// To use the data inside `value`, the passed-in [`Stream`] must be + /// synchronised. + pub unsafe fn async_copy_to( + &self, + value: &mut HostLockedBox, + stream: &Stream, + ) -> CudaResult<()> { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. this function completes the roundtrip + let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + let mut locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; + + unsafe { + rustacuda::memory::AsyncCopyDestination::async_copy_to( + &*device_box, + &mut *locked_box, + stream, + ) + } + } } impl From> for HostDeviceBox { @@ -391,7 +510,7 @@ impl From> for HostDeviceBox { impl From> for DeviceBox { fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip unsafe { DeviceBox::from_device(host_device_box.0) } } @@ -399,7 +518,7 @@ impl From> for DeviceBox { impl Drop for HostDeviceBox { fn drop(&mut self) { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. 
this function completes the roundtrip let device_box = unsafe { DeviceBox::from_device(self.0) }; @@ -426,7 +545,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new< O, @@ -473,9 +592,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { where 'a: 'b, { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceConstRef::new(self.device_box, self.host_ref) } + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + } } #[must_use] @@ -483,9 +603,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { where 'a: 'b, { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceMutRef::new(self.device_box, self.host_ref) } + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } } } @@ -516,7 +637,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new< O, @@ -573,7 +694,187 @@ pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, + >( + mut value: T, + inner: F, + ) -> Result { + let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + + // Safety: `device_box` contains exactly the device copy of `value` + let result = inner(HostAndDeviceOwned { + device_box: &mut device_box, + host_val: &mut value, + }); + + core::mem::drop(device_box); + core::mem::drop(value); + + result + } + + #[must_use] + pub fn for_device(self) -> DeviceMutRef<'a, T> { + DeviceMutRef { + pointer: self.device_box.0.as_raw_mut(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host(&'a mut self) -> &'a T { + self.host_val + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_ref: &'a mut T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new( + device_box: &'a mut HostDeviceBox, + host_ref: &'a mut T, + stream: &'stream Stream, + ) -> Self { + let _ = stream; + + Self { + device_box, + host_ref, + stream: PhantomData::<&'stream Stream>, + } + } + + #[must_use] + /// # Safety + /// + /// The returned [`DeviceMutRef`] must only be used on the constructed-with + /// [`Stream`] + pub unsafe fn for_device_async<'b>(&'b mut self) -> DeviceMutRef<'a, T> + where + 'a: 'b, + { + DeviceMutRef { + pointer: self.device_box.0.as_raw_mut(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host<'b: 'a>(&'b self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: self.stream, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b 
mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: self.stream, + } + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: DeviceCopy> { + device_box: &'a HostDeviceBox, + host_ref: &'a T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: DeviceCopy> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'stream, 'a, T: DeviceCopy> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} + +impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new( + device_box: &'a HostDeviceBox, + host_ref: &'a T, + stream: &'stream Stream, + ) -> Self { + let _ = stream; + + Self { + device_box, + host_ref, + stream: PhantomData::<&'stream Stream>, + } + } + + #[must_use] + /// # Safety + /// + /// The returned [`DeviceConstRef`] must only be used on the + /// constructed-with [`Stream`] + pub unsafe fn for_device_async<'b>(&'b self) -> DeviceConstRef<'a, T> + where + 'a: 'b, + { + DeviceConstRef { + pointer: self.device_box.0.as_raw(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host(&'a self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + *self + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. 
pub fn with_new< O, diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 361151ac2..8f7f1ab98 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -6,7 +6,7 @@ use core::{ use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[repr(transparent)] #[derive(Clone, TypeLayout)] @@ -19,7 +19,8 @@ impl SplitSliceOverCudaThreadsConstStride { } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` +// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is +// [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsConstStride { @@ -190,6 +191,39 @@ unsafe impl RustToCuda } } +unsafe impl RustToCudaAsync + for SplitSliceOverCudaThreadsConstStride +{ + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.0.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsConstStride, STRIDE> { diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 8b0446e08..6cba2ff9c 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -6,7 +6,7 @@ use core::{ use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use 
crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[repr(C)] #[derive(Clone, TypeLayout)] @@ -22,7 +22,8 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` +// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is +// [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} #[cfg(all(not(feature = "host"), target_os = "cuda"))] @@ -167,6 +168,40 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } } +unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.inner.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsDynamicStride> { diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index f8d96d5e2..5a3d1695c 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,6 +1,6 @@ use r#final::Final; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[doc(hidden)] #[repr(transparent)] @@ -8,7 +8,7 @@ use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; #[allow(clippy::module_name_repetitions)] pub struct 
FinalCudaRepresentation(DeviceAccessible); -// Safety: If `T` is `CudaAsRust`, then the newtype struct is `DeviceCopy` +// Safety: If [`T`] is [`CudaAsRust`], then the newtype struct is [`DeviceCopy`] unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} unsafe impl RustToCuda for Final { @@ -48,6 +48,40 @@ unsafe impl RustToCuda for Final { } } +unsafe impl RustToCudaAsync for Final { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + // Safety: Final is a repr(transparent) newtype wrapper around T + let inner: &mut T = &mut *(self as *mut Self).cast(); + + inner.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 289ef9969..1ae0515f9 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,7 +3,7 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -30,42 +30,42 @@ impl SafeDeviceCopyWrapper { } pub fn from_ref(reference: &T) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(reference as *const T).cast() } } pub fn 
into_ref(&self) -> &T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(self as *const Self).cast() } } pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(reference as *mut T).cast() } } pub fn into_mut(&mut self) -> &mut T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(self as *mut Self).cast() } } pub fn from_slice(slice: &[T]) -> &[Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } pub fn into_slice(slice: &[Self]) -> &[T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } } @@ -100,6 +100,36 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } +unsafe impl RustToCudaAsync + for SafeDeviceCopyWrapper +{ + #[cfg(feature = "host")] + 
#[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + Ok((DeviceAccessible::from(&self.0), alloc)) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index a153da4d0..c5d1f9128 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -16,8 +16,8 @@ pub struct CudaExchangeBufferCudaRepresentation` is `DeviceCopy` -// iff `T` is `SafeDeviceCopy` +// Safety: [`CudaExchangeBufferCudaRepresentation`] is [`DeviceCopy`] +// iff [`T`] is [`SafeDeviceCopy`] unsafe impl DeviceCopy for CudaExchangeBufferCudaRepresentation { diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index d284e1193..1ecaf91d2 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,7 +2,10 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{common::RustToCuda, safety::SafeDeviceCopy}; +use crate::{ + common::{RustToCuda, RustToCudaAsync}, + safety::SafeDeviceCopy, +}; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; @@ -42,3 +45,9 @@ unsafe impl; } + +#[cfg(not(all(doc, feature = "host")))] +unsafe impl + RustToCudaAsync for CudaExchangeBufferDevice +{ +} diff --git a/src/utils/exchange/buffer/host.rs 
b/src/utils/exchange/buffer/host.rs index ad522629f..debe33059 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,7 +11,7 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda}, + common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc}, safety::SafeDeviceCopy, }; @@ -39,7 +39,8 @@ impl { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn new(elem: &T, capacity: usize) -> CudaResult { // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T let elem: &CudaExchangeItem = unsafe { &*(elem as *const T).cast() }; @@ -60,7 +61,8 @@ impl CudaExchangeBufferHost { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn from_vec(vec: Vec) -> CudaResult { let mut host_buffer_uninit = CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? 
}); @@ -155,3 +157,60 @@ unsafe impl + RustToCudaAsync for CudaExchangeBufferHost +{ + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + // Safety: device_buffer is inside an UnsafeCell + // borrow checks must be satisfied through LendToCuda + let device_buffer = &mut *self.device_buffer.get(); + + if M2D { + // Only move the buffer contents to the device if needed + + rustacuda::memory::AsyncCopyDestination::async_copy_from( + &mut ***device_buffer, + self.host_buffer.as_slice(), + stream, + )?; + } + + Ok(( + DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( + device_buffer.as_mut_ptr(), + device_buffer.len(), + )), + CombinedCudaAlloc::new(NullCudaAlloc, alloc), + )) + } + + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + + if M2H { + // Only move the buffer contents back to the host if needed + + rustacuda::memory::AsyncCopyDestination::async_copy_to( + &***self.device_buffer.get_mut(), + self.host_buffer.as_mut_slice(), + stream, + )?; + } + + Ok(alloc_tail) + } +} diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 3648f9d04..1a940faa0 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -20,8 +20,8 @@ use crate::safety::SafeDeviceCopy; #[derive(Clone, Copy, TypeLayout)] pub struct CudaExchangeItem(T); -// Safety: Transparent newtype wrapper around `SafeDeviceCopy` -// is `DeviceCopy` +// Safety: Transparent newtype wrapper around [`SafeDeviceCopy`] +// is [`DeviceCopy`] unsafe impl rustacuda_core::DeviceCopy for CudaExchangeItem { diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 26958f491..f22a6defe 100644 --- 
a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,12 +1,20 @@ -use core::ops::{Deref, DerefMut}; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{ + error::CudaResult, + event::{Event, EventFlags}, + memory::DeviceBox, + stream::Stream, +}; use crate::{ - common::{DeviceAccessible, RustToCuda}, + common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{ - CombinedCudaAlloc, EmptyCudaAlloc, HostAndDeviceConstRef, HostAndDeviceMutRef, - HostDeviceBox, NullCudaAlloc, + CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, + HostAndDeviceMutRef, HostDeviceBox, HostLockedBox, NullCudaAlloc, }, }; @@ -14,39 +22,143 @@ use crate::{ pub struct ExchangeWrapperOnHost> { value: T, device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + move_event: CudaDropWrapper, +} + +#[allow(clippy::module_name_repetitions)] +pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { + value: T, + device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + move_event: CudaDropWrapper, + stream: PhantomData<&'stream Stream>, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDevice> { value: T, device_box: HostDeviceBox::CudaRepresentation>>, - cuda_repr: DeviceAccessible<::CudaRepresentation>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + move_event: CudaDropWrapper, +} + +#[allow(clippy::module_name_repetitions)] +pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { + value: T, + device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + move_event: CudaDropWrapper, + stream: PhantomData<&'stream Stream>, } impl> 
ExchangeWrapperOnHost { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn new(value: T) -> CudaResult { + // Safety: The uninitialised memory is never exposed + // To access the device memory, [`Self::move_to_device`] has to be + // called first, which initialised the memory. + let device_box = unsafe { DeviceBox::uninitialized() }?.into(); + let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?; + let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; - let device_box = DeviceBox::new(&cuda_repr)?.into(); + let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); - Ok(Self { value, device_box }) + Ok(Self { + value, + device_box, + locked_cuda_repr, + move_event, + }) } + /// Moves the data synchronously to the CUDA device, where it can then be + /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably + /// via [`ExchangeWrapperOnDevice::as_mut`]. + /// + /// To avoid aliasing, each CUDA thread will get access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?; + *self.locked_cuda_repr = cuda_repr; - self.device_box.copy_from(&cuda_repr)?; + self.device_box.copy_from(&self.locked_cuda_repr)?; Ok(ExchangeWrapperOnDevice { value: self.value, device_box: self.device_box, - cuda_repr, + locked_cuda_repr: self.locked_cuda_repr, null_alloc, + move_event: self.move_event, + }) + } +} + +impl> ExchangeWrapperOnHost { + /// Moves the data asynchronously to the CUDA device. + /// + /// To avoid aliasing, each CUDA thread will get access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_device_async<'stream>( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; + *self.locked_cuda_repr = cuda_repr; + + // Safety: The device value is not safely exposed until either + // - the passed-in [`Stream`] is synchronised + // - the kernel is launched on the passed-in [`Stream`] + unsafe { + self.device_box + .async_copy_from(&self.locked_cuda_repr, stream) + }?; + self.move_event.record(stream)?; + + Ok(ExchangeWrapperOnDeviceAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + }) + } 
+} + +impl<'stream, T: RustToCuda> + ExchangeWrapperOnHostAsync<'stream, T> +{ + /// Synchronises the host CPU thread until the data has moved to the CPU. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn sync_to_host(self) -> CudaResult> { + self.move_event.synchronize()?; + + Ok(ExchangeWrapperOnHost { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, }) } } @@ -65,29 +177,96 @@ impl> DerefMut for ExchangeWrapper } } +impl<'stream, T: RustToCuda> + ExchangeWrapperOnDeviceAsync<'stream, T> +{ + /// Synchronises the host CPU thread until the data has moved to the GPU. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn sync_to_device(self) -> CudaResult> { + self.move_event.synchronize()?; + + Ok(ExchangeWrapperOnDevice { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc: self.null_alloc, + move_event: self.move_event, + }) + } +} + impl> ExchangeWrapperOnDevice { + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + // Reflect deep changes back to the CPU let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + // Note: Shallow changes are not reflected back to the CPU + Ok(ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, }) } pub fn as_ref( &self, ) -> HostAndDeviceConstRef::CudaRepresentation>> { - // Safety: `device_box` contains exactly the device copy of `cuda_repr` - unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.cuda_repr) } + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.locked_cuda_repr) } } pub fn as_mut( &mut self, ) -> HostAndDeviceMutRef::CudaRepresentation>> { - // Safety: `device_box` contains exactly the device copy of `cuda_repr` - unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.cuda_repr) } + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.locked_cuda_repr) } + } +} + +impl> ExchangeWrapperOnDevice { + /// Moves the data asynchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host_async<'stream>( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = + unsafe { self.value.restore_async(self.null_alloc, stream) }?; + + // Note: Shallow changes are not reflected back to the CPU + + self.move_event.record(stream)?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + }) } } diff --git a/src/utils/option.rs b/src/utils/option.rs index 7ef601137..18b86527b 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -3,7 +3,10 @@ use core::mem::MaybeUninit; use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaProxy}, + common::{ + CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, + RustToCudaProxy, + }, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -83,6 +86,62 @@ unsafe impl RustToCuda for Option { } } +unsafe impl RustToCudaAsync for Option { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = match self { + None => ( + OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }, + CombinedCudaAlloc::new(None, alloc), + ), + Some(value) => { + let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; + + let (alloc_front, alloc_tail) = alloc.split(); + + ( + 
OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }, + CombinedCudaAlloc::new(Some(alloc_front), alloc_tail), + ) + }, + }; + + Ok((DeviceAccessible::from(cuda_repr), alloc)) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + + match (self, alloc_front) { + (Some(value), Some(alloc_front)) => { + value.restore_async(CombinedCudaAlloc::new(alloc_front, alloc_tail), stream) + }, + _ => Ok(alloc_tail), + } + } +} + unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = Option<::RustRepresentation>; @@ -101,12 +160,30 @@ impl RustToCudaProxy> for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype + unsafe { &*(val as *const Option).cast() } + } + + fn from_mut(val: &mut Option) -> &mut Self { + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype + unsafe { &mut *(val as *mut Option).cast() } + } + + fn into(self) -> Option { + self.map(SafeDeviceCopyWrapper::into_inner) + } +} + +impl RustToCudaAsyncProxy> + for Option> +{ + fn from_ref(val: &Option) -> &Self { + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype unsafe { &*(val as *const Option).cast() } } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype unsafe { &mut *(val as *mut Option).cast() } } From 4edc14b82c2357a1cebee097ae6f1f87eb3d972e Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 14:36:34 +0000 Subject: [PATCH 002/120] Experiments with Rust Futures --- src/host.rs | 26 ++++-- src/lib.rs | 1 + src/utils/exchange/wrapper.rs | 153 ++++++++++++++++++++++++++++++++-- 3 files 
changed, 163 insertions(+), 17 deletions(-) diff --git a/src/host.rs b/src/host.rs index 3c19ac6fd..d434e8d67 100644 --- a/src/host.rs +++ b/src/host.rs @@ -297,19 +297,29 @@ impl CombinedCudaAlloc { } } -pub struct CudaDropWrapper(Option); +#[repr(transparent)] +pub struct CudaDropWrapper(ManuallyDrop); impl private::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { - Self(Some(val)) + Self(ManuallyDrop::new(val)) + } +} +impl CudaDropWrapper { + pub fn into_inner(self) -> C { + let this = ManuallyDrop::new(self); + + // Safety: move out of drop, caller now has to deal with CUDA drop again + unsafe { core::ptr::read(&*this.0) } } } impl Drop for CudaDropWrapper { fn drop(&mut self) { - if let Some(val) = self.0.take() { - if let Err((_err, val)) = C::drop(val) { - core::mem::forget(val); - } + // Safety: drop is only ever called once + let val = unsafe { ManuallyDrop::take(&mut self.0) }; + + if let Err((_err, val)) = C::drop(val) { + core::mem::forget(val); } } } @@ -317,12 +327,12 @@ impl Deref for CudaDropWrapper { type Target = C; fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() + &self.0 } } impl DerefMut for CudaDropWrapper { fn deref_mut(&mut self) -> &mut Self::Target { - self.0.as_mut().unwrap() + &mut self.0 } } diff --git a/src/lib.rs b/src/lib.rs index 3c176e4a2..2c202ffee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ #![feature(const_type_name)] #![feature(offset_of)] #![feature(adt_const_params)] +#![feature(impl_trait_in_assoc_type)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index f22a6defe..61ab9899f 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,20 +1,25 @@ use core::{ + future::{Future, IntoFuture}, marker::PhantomData, ops::{Deref, DerefMut}, + task::{Poll, Waker}, }; +use 
std::sync::Mutex; +use alloc::sync::Arc; use rustacuda::{ - error::CudaResult, - event::{Event, EventFlags}, + error::{CudaError, CudaResult}, + event::{Event, EventFlags, EventStatus}, memory::DeviceBox, - stream::Stream, + stream::{Stream, StreamWaitEventFlags}, }; use crate::{ common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{ CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, - HostAndDeviceMutRef, HostDeviceBox, HostLockedBox, NullCudaAlloc, + HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, + HostLockedBox, NullCudaAlloc, }, }; @@ -51,7 +56,8 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda::CudaRepresentation>>, null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, move_event: CudaDropWrapper, - stream: PhantomData<&'stream Stream>, + stream: &'stream Stream, + waker: Arc>>, } impl> ExchangeWrapperOnHost { @@ -116,10 +122,10 @@ impl> ExchangeWrapperOnHost( + pub fn move_to_device_async( mut self, - stream: &'stream Stream, - ) -> CudaResult> { + stream: &Stream, + ) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; *self.locked_cuda_repr = cuda_repr; @@ -132,13 +138,25 @@ impl> ExchangeWrapperOnHost>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnDeviceAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, null_alloc, move_event: self.move_event, - stream: PhantomData::<&'stream Stream>, + stream, + waker, }) } } @@ -161,6 +179,30 @@ impl<'stream, T: RustToCuda> move_event: self.move_event, }) } + + /// Moves the asynchronous data move to a different [`Stream`]. 
+ /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_stream<'stream2>( + self, + stream: &'stream2 Stream, + ) -> CudaResult> { + let old_event = self.move_event.into_inner(); + let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); + + stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; + new_event.record(stream)?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: new_event, + stream: PhantomData::<&'stream2 Stream>, + }) + } } impl> Deref for ExchangeWrapperOnHost { @@ -196,6 +238,99 @@ impl<'stream, T: RustToCuda> move_event: self.move_event, }) } + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_stream( + self, + stream: &Stream, + ) -> CudaResult> { + let old_event = self.move_event.into_inner(); + let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); + + stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; + new_event.record(stream)?; + + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + + Ok(ExchangeWrapperOnDeviceAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc: self.null_alloc, + move_event: new_event, + stream, + waker: self.waker, + }) + } + + pub fn as_ref_async( + &self, + ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { + HostAndDeviceConstRefAsync::new(&self.device_box, &self.locked_cuda_repr, self.stream) + } + } + + pub fn as_mut_async( + &mut self, + ) -> 
HostAndDeviceMutRefAsync::CudaRepresentation>> { + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { + HostAndDeviceMutRefAsync::new( + &mut self.device_box, + &mut self.locked_cuda_repr, + self.stream, + ) + } + } +} + +impl<'stream, T: RustToCuda> IntoFuture + for ExchangeWrapperOnDeviceAsync<'stream, T> +{ + type Output = CudaResult>; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + core::future::poll_fn(move |cx| match &wrapper { + Some(inner) => match inner.move_event.query() { + Ok(EventStatus::NotReady) => match inner.waker.lock() { + Ok(mut w) => { + *w = Some(cx.waker().clone()); + Poll::Pending + }, + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + Ok(EventStatus::Ready) => match wrapper.take() { + Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { + value: inner.value, + device_box: inner.device_box, + locked_cuda_repr: inner.locked_cuda_repr, + null_alloc: inner.null_alloc, + move_event: inner.move_event, + })), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }) + } } impl> ExchangeWrapperOnDevice { From 8aa63164eb25d5aec06121fc46941b5e2893ea0c Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 16:53:29 +0000 Subject: [PATCH 003/120] Implemented derive for RustToCudaAsync --- rust-cuda-derive/src/rust_to_cuda/generics.rs | 37 ++++- rust-cuda-derive/src/rust_to_cuda/impl.rs | 73 ++++++++- rust-cuda-derive/src/rust_to_cuda/mod.rs | 14 +- src/utils/exchange/wrapper.rs | 144 ++++++++++++++++-- 4 files changed, 250 insertions(+), 18 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 8b21246d2..d08b1e7c3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -4,7 
+4,12 @@ use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] pub fn expand_cuda_struct_generics_where_requested_in_attrs( ast: &syn::DeriveInput, -) -> (Vec, syn::Generics, Vec) { +) -> ( + Vec, + syn::Generics, + syn::Generics, + Vec, +) { let mut type_params = ast .generics .type_params() @@ -13,6 +18,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut struct_attrs_cuda = ast.attrs.clone(); let mut struct_generics_cuda = ast.generics.clone(); + let mut struct_generics_cuda_async = ast.generics.clone(); let mut struct_layout_attrs = Vec::new(); for ty in &type_params { @@ -36,11 +42,17 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( path, lit: syn::Lit::Str(s), .. - })) if path.is_ident("bound") => match syn::parse_str(&s.value()) { - Ok(bound) => struct_generics_cuda - .make_where_clause() - .predicates - .push(bound), + })) if path.is_ident("bound") => match syn::parse_str::(&s.value()) { + Ok(bound) => { + struct_generics_cuda + .make_where_clause() + .predicates + .push(bound.clone()); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(bound); + }, Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(bound = \"\")] \ @@ -136,7 +148,18 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( .push(syn::parse_quote! { #ty: ::rust_cuda::common::RustToCuda }); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(syn::parse_quote! 
{ + #ty: ::rust_cuda::common::RustToCudaAsync + }); } - (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) + ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 2c6593068..1028f0ed6 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -82,7 +82,8 @@ pub fn rust_to_cuda_trait( #[cfg(not(target_os = "cuda"))] unsafe fn borrow( - &self, alloc: CudaAllocType + &self, + alloc: CudaAllocType, ) -> rust_cuda::rustacuda::error::CudaResult<( rust_cuda::common::DeviceAccessible, rust_cuda::host::CombinedCudaAlloc @@ -117,6 +118,76 @@ pub fn rust_to_cuda_trait( } } +#[allow(clippy::too_many_arguments)] +pub fn rust_to_cuda_async_trait( + struct_name: &syn::Ident, + struct_name_cuda: &syn::Ident, + struct_generics_cuda_async: &syn::Generics, + struct_fields_cuda: &syn::Fields, + r2c_field_async_declarations: &[TokenStream], + r2c_field_initialisations: &[TokenStream], + r2c_field_async_destructors: &[TokenStream], +) -> TokenStream { + let rust_to_cuda_struct_construction = match struct_fields_cuda { + syn::Fields::Named(_) => quote! { + #struct_name_cuda { + #(#r2c_field_initialisations)* + } + }, + syn::Fields::Unnamed(_) => quote! { + #struct_name_cuda ( + #(#r2c_field_initialisations)* + ) + }, + syn::Fields::Unit => quote! { #struct_name_cuda }, + }; + + let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); + + quote! 
{ + unsafe impl #impl_generics rust_cuda::common::RustToCudaAsync for #struct_name #ty_generics + #where_clause + { + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rust_cuda::rustacuda::stream::Stream, + ) -> rust_cuda::rustacuda::error::CudaResult<( + rust_cuda::common::DeviceAccessible, + rust_cuda::host::CombinedCudaAlloc + )> { + let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_tail = alloc; + + #(#r2c_field_async_declarations)* + + let borrow = #rust_to_cuda_struct_construction; + + Ok(( + rust_cuda::common::DeviceAccessible::from(borrow), + rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + )) + } + + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: rust_cuda::host::CombinedCudaAlloc< + Self::CudaAllocation, CudaAllocType + >, + stream: &rust_cuda::rustacuda::stream::Stream, + ) -> rust_cuda::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + + #(#r2c_field_async_destructors)* + + Ok(alloc_tail) + } + } + } +} + pub fn cuda_as_rust_trait( struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 00e756c00..5cfa6fb18 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -69,7 +69,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) = + let (struct_attrs_cuda, struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let cuda_struct_declaration = r#impl::cuda_struct_declaration( @@ -93,6 +93,16 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &r2c_field_destructors, ); + let rust_to_cuda_async_trait_impl = 
r#impl::rust_to_cuda_async_trait( + struct_name, + &struct_name_cuda, + &struct_generics_cuda_async, + &struct_fields_cuda, + &r2c_field_async_declarations, + &r2c_field_initialisations, + &r2c_field_async_destructors, + ); + let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( struct_name, &struct_name_cuda, @@ -106,6 +116,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { #rust_to_cuda_trait_impl + #rust_to_cuda_async_trait_impl + #cuda_as_rust_trait_impl }) .into() diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 61ab9899f..e9d5a0329 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -38,6 +38,7 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda::CudaRepresentation>>, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, + waker: Arc>>, } #[allow(clippy::module_name_repetitions)] @@ -185,22 +186,64 @@ impl<'stream, T: RustToCuda> /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_stream<'stream2>( - self, - stream: &'stream2 Stream, - ) -> CudaResult> { + pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { let old_event = self.move_event.into_inner(); let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; new_event.record(stream)?; + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnHostAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, move_event: new_event, - stream: PhantomData::<&'stream2 Stream>, + stream: PhantomData::<&Stream>, + waker: self.waker, + }) + } +} + +impl<'stream, T: RustToCuda> IntoFuture + for ExchangeWrapperOnHostAsync<'stream, T> +{ + type Output = 
CudaResult>; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + core::future::poll_fn(move |cx| match &wrapper { + Some(inner) => match inner.move_event.query() { + Ok(EventStatus::NotReady) => match inner.waker.lock() { + Ok(mut w) => { + *w = Some(cx.waker().clone()); + Poll::Pending + }, + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + Ok(EventStatus::Ready) => match wrapper.take() { + Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { + value: inner.value, + device_box: inner.device_box, + locked_cuda_repr: inner.locked_cuda_repr, + move_event: inner.move_event, + })), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), }) } } @@ -295,6 +338,77 @@ impl<'stream, T: RustToCuda> ) } } + + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host(mut self) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + + // Note: Shallow changes are not reflected back to the CPU + + Ok(ExchangeWrapperOnHost { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + }) + } +} + +impl<'stream, T: RustToCudaAsync> + ExchangeWrapperOnDeviceAsync<'stream, T> +{ + /// Moves the data asynchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host_async( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = + unsafe { self.value.restore_async(self.null_alloc, stream) }?; + + // Note: Shallow changes are not reflected back to the CPU + + self.move_event.record(stream)?; + + let waker: Arc>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + waker, + }) + } } impl<'stream, T: RustToCuda> IntoFuture @@ -384,10 +498,10 @@ impl> ExchangeWrapperOnDevice /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_host_async<'stream>( + pub fn move_to_host_async( mut self, - stream: &'stream Stream, - ) -> CudaResult> { + stream: &Stream, + ) -> CudaResult> { // Reflect deep changes back to the CPU let _null_alloc: NullCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; @@ -396,12 +510,24 @@ impl> ExchangeWrapperOnDevice self.move_event.record(stream)?; + let waker: Arc>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnHostAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, move_event: 
self.move_event, - stream: PhantomData::<&'stream Stream>, + stream: PhantomData::<&Stream>, + waker, }) } } From e7b6174ff1112bf3765db8411f6c873be6653a7e Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 18:47:36 +0000 Subject: [PATCH 004/120] Implemented async kernel launch --- .../generate/cpu_linker_macro/get_ptx_str.rs | 2 +- .../generate/cpu_linker_macro/kernel_func.rs | 18 +++-- .../async_func_types.rs} | 8 +-- .../launch_types.rs | 0 .../mod.rs | 24 +++---- .../type_wrap.rs | 4 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 8 +-- .../kernel/wrapper/generate/cpu_wrapper.rs | 19 +++--- rust-cuda-derive/src/kernel/wrapper/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/parse.rs | 2 +- src/host.rs | 68 +++++++++++-------- 11 files changed, 87 insertions(+), 70 deletions(-) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw/raw_func_types.rs => kernel_func_async/async_func_types.rs} (93%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/launch_types.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/mod.rs (87%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/type_wrap.rs (89%) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index dadda41ec..d39246484 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -30,7 +30,7 @@ pub(super) fn quote_get_ptx_str( .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); let cpu_func_lifetime_erased_types = - super::kernel_func_raw::generate_launch_types(config, generics, inputs, macro_type_ids).1; + 
super::kernel_func_async::generate_launch_types(config, generics, inputs, macro_type_ids).1; let matching_kernel_assert = if skip_kernel_compilation() { quote!() diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 7cad78e05..fda5b96e4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -89,7 +89,9 @@ fn generate_raw_func_input_wrap( func_inputs, func_input_cuda_types, }: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, + FuncIdent { + func_ident_async, .. + }: &FuncIdent, func_params: &[syn::Ident], ) -> TokenStream { func_inputs @@ -99,7 +101,11 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_raw(#(#func_params),*) + self.#func_ident_async(#(#func_params),*)?; + let rust_cuda::host::LaunchPackage { + stream, .. + } = rust_cuda::host::Launcher::get_launch_package(self); + stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { @@ -119,7 +125,7 @@ fn generate_raw_func_input_wrap( let __result = (|#pat| { #inner })(unsafe { rust_cuda::host::HostAndDeviceConstRef::new( &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ) + ).as_async() }); #[allow(invalid_reference_casting)] @@ -149,16 +155,16 @@ fn generate_raw_func_input_wrap( if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { if mutability.is_some() { quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( - #pat, |#pat| { #inner } + #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } else { quote! { rust_cuda::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { #inner } + #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } } else { quote! 
{ rust_cuda::host::LendToCuda::move_to_cuda( - #pat, |#pat| { #inner } + #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs similarity index 93% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 380048ec5..50e74b02e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -5,7 +5,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; -pub(super) fn generate_raw_func_types( +pub(super) fn generate_async_func_types( KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -62,11 +62,11 @@ pub(super) fn generate_raw_func_types( } quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -77,7 +77,7 @@ pub(super) fn generate_raw_func_types( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> }; quote! 
{ diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs similarity index 87% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index ab352b4c8..112e760c9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -2,32 +2,32 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +mod async_func_types; mod launch_types; -mod raw_func_types; mod type_wrap; +use async_func_types::generate_async_func_types; pub(super) use launch_types::generate_launch_types; -use raw_func_types::generate_raw_func_types; use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_raw( +pub(super) fn quote_kernel_func_async( config @ KernelConfig { args, .. }: &KernelConfig, decl_generics @ DeclGenerics { - generic_start_token, generic_wrapper_params, - generic_close_token, generic_wrapper_where_clause, .. }: &DeclGenerics, func_inputs: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, + FuncIdent { + func_ident_async, .. 
+ }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_raw = - generate_raw_func_types(config, decl_generics, func_inputs, macro_type_ids); + let new_func_inputs_async = + generate_async_func_types(config, decl_generics, func_inputs, macro_type_ids); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = @@ -36,8 +36,8 @@ pub(super) fn quote_kernel_func_raw( quote! { #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw),* + fn #func_ident_async <'stream, #generic_wrapper_params> ( + &'stream mut self, #(#new_func_inputs_async),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -102,9 +102,7 @@ pub(super) fn quote_kernel_func_raw( &#func_params as *const _ as *mut ::std::ffi::c_void ),* ] - ) }?; - - stream.synchronize() + ) } })(#(#func_input_wrap),*) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs similarity index 89% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs index 432930731..50ea505f1 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs @@ -17,9 +17,9 @@ pub(super) fn generate_func_input_and_ptx_jit_wraps( syn::FnArg::Typed(syn::PatType { pat, ty, .. 
}) => { #[allow(clippy::if_same_then_else)] let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { #pat.for_device() } + quote! { unsafe { #pat.for_device_async() } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { #pat.for_device() } + quote! { unsafe { #pat.for_device_async() } } } else { quote! { #pat } }; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 7ab891e7e..52fd5c506 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -4,12 +4,12 @@ use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; mod get_ptx_str; mod kernel_func; -mod kernel_func_raw; +mod kernel_func_async; mod new_kernel; use get_ptx_str::quote_get_ptx_str; use kernel_func::quote_kernel_func; -use kernel_func_raw::quote_kernel_func_raw; +use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( @@ -73,7 +73,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_attrs, ¯o_type_ids, ); - let kernel_func_raw = quote_kernel_func_raw( + let kernel_func_async = quote_kernel_func_async( config, decl_generics, func_inputs, @@ -97,7 +97,7 @@ pub(in super::super) fn quote_cpu_linker_macro( #kernel_func - #kernel_func_raw + #kernel_func_async } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index cad3cdc6a..ed93c61dc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -23,7 +23,7 @@ pub(in super::super) fn quote_cpu_wrapper( func_inputs: &FunctionInputs, FuncIdent { func_ident, - func_ident_raw, + func_ident_async, .. 
}: &FuncIdent, func_attrs: &[syn::Attribute], @@ -54,7 +54,7 @@ pub(in super::super) fn quote_cpu_wrapper( }, }; - let (new_func_inputs_decl, new_func_inputs_raw_decl) = + let (new_func_inputs_decl, new_func_inputs_async_decl) = generate_new_func_inputs_decl(config, impl_generics, func_inputs); quote! { @@ -76,8 +76,8 @@ pub(in super::super) fn quote_cpu_wrapper( #generic_wrapper_where_clause; #(#func_attrs)* - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw_decl),* + fn #func_ident_async <'stream, #generic_wrapper_params> ( + &'stream mut self, #(#new_func_inputs_async_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } @@ -157,11 +157,11 @@ fn generate_new_func_inputs_decl( { let wrapped_type = if mutability.is_some() { syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -170,7 +170,7 @@ fn generate_new_func_inputs_decl( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> ); Box::new(wrapped_type) @@ -178,9 +178,8 @@ fn generate_new_func_inputs_decl( cuda_type } }, - }), + }) ), syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() + }).unzip() } diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 6f63af892..c057fe7f1 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -128,7 +128,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let func_ident = FuncIdent { func_ident: 
&func.sig.ident, - func_ident_raw: quote::format_ident!("{}_raw", &func.sig.ident), + func_ident_async: quote::format_ident!("{}_async", &func.sig.ident), func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), }; @@ -251,7 +251,7 @@ struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] struct FuncIdent<'f> { func_ident: &'f syn::Ident, - func_ident_raw: syn::Ident, + func_ident_async: syn::Ident, func_ident_hash: syn::Ident, } diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 936143cf2..7d523adb0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -23,7 +23,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { if func.sig.asyncness.is_some() { abort!( func.sig.asyncness.span(), - "Kernel function must not (yet) be async." + "Kernel function must not be async." ); } diff --git a/src/host.rs b/src/host.rs index d434e8d67..f600d9b6e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -618,6 +618,18 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { host_ref: self.host_ref, } } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -693,6 +705,18 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { { *self } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -740,6 +764,18 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { pub fn for_host(&'a mut self) -> &'a T { 
self.host_val } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceOwnedAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceOwnedAsync { + device_box: self.device_box, + host_val: self.host_val, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -882,34 +918,12 @@ pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> } impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { - /// # Errors - /// - /// Returns a [`CudaError`] iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. - pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); - - // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result - } - #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { + /// # Safety + /// + /// The returned [`DeviceConstRef`] must only be used on the + /// constructed-with [`Stream`] + pub unsafe fn for_device_async(self) -> DeviceMutRef<'a, T> { DeviceMutRef { pointer: self.device_box.0.as_raw_mut(), reference: PhantomData, From d93fc4ccb8bdbe86dbce07e2c5959e61f2de5f4c Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 19:09:12 +0000 Subject: [PATCH 005/120] Fixed RustToCudaAsync derive --- .../src/rust_to_cuda/field_copy.rs | 1 + rust-cuda-derive/src/rust_to_cuda/generics.rs | 25 +++++++++++--- rust-cuda-derive/src/rust_to_cuda/mod.rs | 33 ++++++++++++------- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 93326aab6..61891aa8c 100644 
--- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -143,6 +143,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, + stream, )?; }); diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index d08b1e7c3..646686534 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -9,6 +9,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( syn::Generics, syn::Generics, Vec, + bool, ) { let mut type_params = ast .generics @@ -30,6 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut r2c_ignore = false; + let mut r2c_async_impl = None; + struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { if let Ok(syn::Meta::List(list)) = attr.parse_meta() { @@ -90,11 +93,22 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { - path: - syn::Path { - leading_colon: None, - segments, - }, + path, + lit: syn::Lit::Bool(b), + .. + })) if path.is_ident("async") => if r2c_async_impl.is_none() { + r2c_async_impl = Some(b.value()); + } else { + emit_error!( + b.span(), + "[rust-cuda]: Duplicate #[cuda(async)] attribute.", + ); + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: None, + segments, + }, lit: syn::Lit::Str(s), .. 
})) if segments.len() == 2 @@ -161,5 +175,6 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs, + r2c_async_impl.unwrap_or(true), ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 5cfa6fb18..dc8eb6491 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -10,7 +10,7 @@ fn get_cuda_repr_ident(rust_repr_ident: &proc_macro2::Ident) -> proc_macro2::Ide format_ident!("{}CudaRepresentation", rust_repr_ident) } -#[allow(clippy::module_name_repetitions)] +#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let (mut struct_fields_cuda, struct_semi_cuda) = if let syn::Data::Struct(s) = &ast.data { (s.fields.clone(), s.semi_token) @@ -69,8 +69,13 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let (struct_attrs_cuda, struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs) = - generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); + let ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl, + ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let cuda_struct_declaration = r#impl::cuda_struct_declaration( &struct_attrs_cuda, @@ -93,15 +98,19 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &r2c_field_destructors, ); - let rust_to_cuda_async_trait_impl = r#impl::rust_to_cuda_async_trait( - struct_name, - &struct_name_cuda, - &struct_generics_cuda_async, - &struct_fields_cuda, - &r2c_field_async_declarations, - &r2c_field_initialisations, - &r2c_field_async_destructors, - ); + let rust_to_cuda_async_trait_impl = if r2c_async_impl { + r#impl::rust_to_cuda_async_trait( + struct_name, + 
&struct_name_cuda, + &struct_generics_cuda_async, + &struct_fields_cuda, + &r2c_field_async_declarations, + &r2c_field_initialisations, + &r2c_field_async_destructors, + ) + } else { + TokenStream::new() + }; let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( struct_name, From 5481e47fd51f644f4fbc65b488d54a702ca4e722 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 19:32:01 +0000 Subject: [PATCH 006/120] LaunchPackage with non-mut Stream --- src/host.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host.rs b/src/host.rs index f600d9b6e..cc44abe9a 100644 --- a/src/host.rs +++ b/src/host.rs @@ -53,7 +53,7 @@ pub struct LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, pub kernel: &'l mut TypedKernel, - pub stream: &'l mut Stream, + pub stream: &'l Stream, pub watcher: &'l mut L::CompilationWatcher, } From 6a9d4b6c3692a2848b15903c5ff18c5b9090b3e9 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 20:18:53 +0000 Subject: [PATCH 007/120] Moved stream to be an explicit kernel argument --- .../generate/cpu_linker_macro/kernel_func.rs | 11 +++++------ .../cpu_linker_macro/kernel_func_async/mod.rs | 8 +++++--- .../src/kernel/wrapper/generate/cpu_wrapper.rs | 14 ++++++++++---- src/host.rs | 3 --- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index fda5b96e4..41d4244b0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -57,8 +57,10 @@ pub(super) fn quote_kernel_func( quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs),* + fn #func_ident <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -101,10 +103,7 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_async(#(#func_params),*)?; - let rust_cuda::host::LaunchPackage { - stream, .. - } = rust_cuda::host::Launcher::get_launch_package(self); + self.#func_ident_async(stream, #(#func_params),*)?; stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 112e760c9..6980a5753 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -36,13 +36,15 @@ pub(super) fn quote_kernel_func_async( quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_async <'stream, #generic_wrapper_params> ( - &'stream mut self, #(#new_func_inputs_async),* + fn #func_ident_async <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_async),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { let rust_cuda::host::LaunchPackage { - kernel, watcher, config, stream + kernel, watcher, config } = rust_cuda::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index ed93c61dc..e5c318140 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -70,14 +70,20 @@ pub(in super::super) fn quote_cpu_wrapper( > where #launcher_predicate; #(#func_attrs)* - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_decl),* + #[allow(clippy::too_many_arguments)] + fn #func_ident <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; #(#func_attrs)* - fn #func_ident_async <'stream, #generic_wrapper_params> ( - &'stream mut self, #(#new_func_inputs_async_decl),* + #[allow(clippy::too_many_arguments)] + fn #func_ident_async <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_async_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } diff --git a/src/host.rs b/src/host.rs index cc44abe9a..98ad817ba 100644 --- a/src/host.rs +++ b/src/host.rs @@ -51,10 +51,7 @@ pub struct LaunchConfig { pub struct 
LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, - pub kernel: &'l mut TypedKernel, - pub stream: &'l Stream, - pub watcher: &'l mut L::CompilationWatcher, } From d1ae9aba11bfe69375be41ef1204704406cd15ff Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 20 Nov 2022 04:39:03 -0800 Subject: [PATCH 008/120] Updated ExchangeWrapperOn[Device|Host]Async::move_to_stream --- Cargo.toml | 6 +++--- rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 -- rust-cuda-ptx-jit/Cargo.toml | 2 +- src/utils/exchange/wrapper.rs | 18 ++++++------------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 17a279023..4dc3f5af1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52" } +rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d" } -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } -rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1028f0ed6..ff607af28 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -42,8 +42,6 @@ pub fn cuda_struct_declaration( } } -// TODO: derive async impl as well -> need different trait bounds - #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( 
struct_name: &syn::Ident, diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index d5b832eb8..5afb336e4 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index e9d5a0329..a4a8e50f7 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -187,11 +187,8 @@ impl<'stream, T: RustToCuda> /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { - let old_event = self.move_event.into_inner(); - let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); - - stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; - new_event.record(stream)?; + stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; + self.move_event.record(stream)?; let waker_callback = self.waker.clone(); stream.add_callback(Box::new(move |_| { @@ -206,7 +203,7 @@ impl<'stream, T: RustToCuda> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: new_event, + move_event: self.move_event, stream: PhantomData::<&Stream>, waker: self.waker, }) @@ -291,11 +288,8 @@ impl<'stream, T: RustToCuda> self, stream: &Stream, ) -> CudaResult> { - let old_event = self.move_event.into_inner(); - let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); - - stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; - new_event.record(stream)?; + 
stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; + self.move_event.record(stream)?; let waker_callback = self.waker.clone(); stream.add_callback(Box::new(move |_| { @@ -311,7 +305,7 @@ impl<'stream, T: RustToCuda> device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, null_alloc: self.null_alloc, - move_event: new_event, + move_event: self.move_event, stream, waker: self.waker, }) From d70ea5c5107c98a9b0d51de4ea4e7dae95a78bda Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 20 Nov 2022 05:35:17 -0800 Subject: [PATCH 009/120] Upgraded to fixed RustaCuda --- Cargo.toml | 6 +++--- rust-cuda-ptx-jit/Cargo.toml | 2 +- src/host.rs | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4dc3f5af1..2ebfbe32e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d" } +rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } -rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } +rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index 5afb336e4..aa7fa32c6 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { git = 
"https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/host.rs b/src/host.rs index 98ad817ba..a104c50a3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -302,14 +302,6 @@ impl From for CudaDropWrapper { Self(ManuallyDrop::new(val)) } } -impl CudaDropWrapper { - pub fn into_inner(self) -> C { - let this = ManuallyDrop::new(self); - - // Safety: move out of drop, caller now has to deal with CUDA drop again - unsafe { core::ptr::read(&*this.0) } - } -} impl Drop for CudaDropWrapper { fn drop(&mut self) { // Safety: drop is only ever called once From 077e965a4d9bb44bfaefb1d1549da682437bf302 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 02:08:34 -0800 Subject: [PATCH 010/120] Added scratch-space methods for uni-directional CudaExchangeItem --- src/utils/exchange/buffer/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 1a940faa0..e7141a43e 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -60,3 +60,19 @@ impl AsMut for CudaExchangeItem { &mut self.0 } } + +impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_scratch_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_scratch_mut(&mut self) -> &mut T { + &mut self.0 + } +} From ea6e45997348fbd9d6bb20cfe18dbce0026bbb3a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 02:32:38 -0800 Subject: [PATCH 011/120] Added unsafe-aliasing API to SplitSlideOverCudaThreads[Const|Dynamic]Stride --- src/utils/aliasing/const.rs | 21 +++++++++++++++++++++ 
src/utils/aliasing/dynamic.rs | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 8f7f1ab98..e1d069710 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -42,6 +42,27 @@ fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +impl SplitSliceOverCudaThreadsConstStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_unchecked(&self) -> &T { + &self.0 + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + &mut self.0 + } +} + #[cfg(all(not(feature = "host"), target_os = "cuda"))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 6cba2ff9c..c07bd60a4 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -42,6 +42,27 @@ fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +impl SplitSliceOverCudaThreadsDynamicStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. 
+ pub unsafe fn get_unchecked(&self) -> &T { + &self.inner + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + &mut self.inner + } +} + #[cfg(all(not(feature = "host"), target_os = "cuda"))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; From 578453f2e48466f50e9559d4506fd85ebe4baf14 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 03:11:01 -0800 Subject: [PATCH 012/120] Extended the CudaExchangeItem API with scratch and uMaybeUninit --- src/utils/exchange/buffer/mod.rs | 54 ++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index e7141a43e..c4e4b24bd 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -1,3 +1,5 @@ +use core::mem::MaybeUninit; + mod common; #[cfg(any(not(feature = "host"), doc))] mod device; @@ -62,6 +64,12 @@ impl AsMut for CudaExchangeItem { } impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_scratch(&self) -> &T { + &self.0 + } + #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] pub fn as_scratch_mut(&mut self) -> &mut T { @@ -70,9 +78,55 @@ impl CudaExchangeItem { } impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_scratch(&self) -> &T { + &self.0 + } + #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } + +impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_uninit(&self) -> &MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is 
a transparent newtype + unsafe { &*(self as *const Self).cast() } + } + + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &mut *(self as *mut Self).cast() } + } +} + +impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_uninit(&self) -> &MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &*(self as *const Self).cast() } + } + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &mut *(self as *mut Self).cast() } + } +} From c55d26979094e859557babafc449fb7cdbc40c16 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 03:43:06 -0800 Subject: [PATCH 013/120] Rename SplitSliceOverCudaThreads[Const|Dynamic]Strude::alias_[mut_]unchecked --- src/utils/aliasing/const.rs | 4 ++-- src/utils/aliasing/dynamic.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index e1d069710..a60a94eb9 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_unchecked(&self) -> &T { + pub unsafe fn alias_unchecked(&self) -> &T { &self.0 } @@ -58,7 +58,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
/// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { &mut self.0 } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c07bd60a4..668112f88 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_unchecked(&self) -> &T { + pub unsafe fn alias_unchecked(&self) -> &T { &self.inner } @@ -58,7 +58,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. 
- pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { &mut self.inner } } From 96036b08253aafd0c68a91a814b03c1768656f34 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Tue, 29 Nov 2022 09:12:14 +0000 Subject: [PATCH 014/120] Implemented #[cuda(crate)] and #[kernel(crate)] attributes --- examples/derive/Cargo.toml | 2 +- examples/derive/src/lib.rs | 6 +- examples/single-source/Cargo.toml | 4 +- examples/single-source/src/main.rs | 32 ++++---- .../generate/cpu_linker_macro/get_ptx_str.rs | 31 +++++--- .../generate/cpu_linker_macro/kernel_func.rs | 28 ++++--- .../kernel_func_async/async_func_types.rs | 13 ++-- .../kernel_func_async/launch_types.rs | 19 ++--- .../cpu_linker_macro/kernel_func_async/mod.rs | 42 +++++++---- .../wrapper/generate/cpu_linker_macro/mod.rs | 12 ++- .../generate/cpu_linker_macro/new_kernel.rs | 9 ++- .../kernel/wrapper/generate/cpu_wrapper.rs | 30 ++++---- .../kernel/wrapper/generate/cuda_wrapper.rs | 51 +++++++------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 74 +++++++++++++++++-- .../src/rust_to_cuda/field_copy.rs | 45 +++++------ rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 17 +++-- rust-cuda-derive/src/rust_to_cuda/generics.rs | 47 ++++++++++-- rust-cuda-derive/src/rust_to_cuda/impl.rs | 64 +++++++++------- rust-cuda-derive/src/rust_to_cuda/mod.rs | 27 ++++--- 19 files changed, 354 insertions(+), 199 deletions(-) diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml index e59a344af..f4ea53d90 100644 --- a/examples/derive/Cargo.toml +++ b/examples/derive/Cargo.toml @@ -9,4 +9,4 @@ edition = "2021" [dependencies] const-type-layout = { version = "0.2.0" } -rust-cuda = { path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 814e30f61..76a7d3cb1 100644 --- a/examples/derive/src/lib.rs +++ 
b/examples/derive/src/lib.rs @@ -2,13 +2,15 @@ #![feature(const_type_name)] #![feature(offset_of)] -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 128da7cef..351d694a0 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -11,7 +11,7 @@ edition = "2021" const-type-layout = { version = "0.2.0" } [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 36c0736c6..79f6e3ec1 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,32 +10,34 @@ extern crate alloc; -#[macro_use] -extern crate const_type_layout; - #[cfg(not(target_os = "cuda"))] fn main() {} #[repr(C)] -#[derive(TypeLayout)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = "rc::const_type_layout")] pub struct Dummy(i32); -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] inner: T, } -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] pub struct Empty([u8; 0]); #[repr(C)] -#[derive(TypeLayout)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); 
-#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] -pub fn kernel<'a, T: rust_cuda::common::RustToCuda>( +#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[kernel(crate = "rc")] +pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -43,7 +45,7 @@ pub fn kernel<'a, T: rust_cuda::common::RustToCuda>( #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, ) where - ::CudaRepresentation: rust_cuda::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, { } @@ -52,16 +54,16 @@ mod host { use super::{Kernel, KernelArgs}; #[allow(dead_code)] - struct Launcher(core::marker::PhantomData); + struct Launcher(core::marker::PhantomData); link_kernel!(crate::Empty); - link_kernel!(rust_cuda::utils::device_copy::SafeDeviceCopyWrapper); + link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); - impl rust_cuda::host::Launcher for Launcher { + impl rc::host::Launcher for Launcher { type CompilationWatcher = (); type KernelTraitObject = dyn Kernel; - fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { + fn get_launch_package(&mut self) -> rc::host::LaunchPackage { unimplemented!() } } @@ -71,7 +73,7 @@ mod host { mod cuda_prelude { use core::arch::nvptx; - use rust_cuda::device::utils; + use rc::device::utils; #[global_allocator] static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d39246484..179ba7eed 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -6,6 +6,7 @@ use 
crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; pub(super) fn quote_get_ptx_str( + crate_path: &syn::Path, FuncIdent { func_ident, func_ident_hash, @@ -29,19 +30,25 @@ pub(super) fn quote_get_ptx_str( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let cpu_func_lifetime_erased_types = - super::kernel_func_async::generate_launch_types(config, generics, inputs, macro_type_ids).1; + let cpu_func_lifetime_erased_types = super::kernel_func_async::generate_launch_types( + crate_path, + config, + generics, + inputs, + macro_type_ids, + ) + .1; let matching_kernel_assert = if skip_kernel_compilation() { quote!() } else { quote::quote_spanned! { func_ident.span()=> - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( + const _: #crate_path::safety::kernel_signature::Assert<{ + #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match + }> = #crate_path::safety::kernel_signature::Assert::<{ + #crate_path::safety::kernel_signature::check( PTX_STR.as_bytes(), - concat!(".visible .entry ", rust_cuda::host::specialise_kernel_call!( + concat!(".visible .entry ", #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -64,10 +71,10 @@ pub(super) fn quote_get_ptx_str( ); quote::quote_spanned! 
{ ty.span()=> - const _: ::rust_cuda::safety::type_layout::Assert<{ - ::rust_cuda::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = ::rust_cuda::safety::type_layout::Assert::<{ - ::rust_cuda::safety::type_layout::check::<#ty>(#layout_param) + const _: #crate_path::safety::type_layout::Assert<{ + #crate_path::safety::type_layout::CpuAndGpuTypeLayouts::Match + }> = #crate_path::safety::type_layout::Assert::<{ + #crate_path::safety::type_layout::check::<#ty>(#layout_param) }>; } }) @@ -76,7 +83,7 @@ pub(super) fn quote_get_ptx_str( quote! { fn get_ptx_str() -> &'static str { - rust_cuda::host::link_kernel!{ + #crate_path::host::link_kernel!{ #func_ident #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 41d4244b0..d6e70e276 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -2,7 +2,9 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +#[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -52,16 +54,17 @@ pub(super) fn quote_kernel_func( }) .collect::>(); - let raw_func_input_wrap = generate_raw_func_input_wrap(inputs, fn_ident, func_params); + let raw_func_input_wrap = + generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { // impls check adapted from Nikolai Vazquez's `impls` crate: @@ -87,6 +90,7 @@ pub(super) fn quote_kernel_func( #[allow(clippy::too_many_lines)] fn generate_raw_func_input_wrap( + crate_path: &syn::Path, FunctionInputs { func_inputs, func_input_cuda_types, @@ -114,16 +118,16 @@ fn generate_raw_func_input_wrap( // DeviceCopy mode only supports immutable references quote! { - let mut #pat_box = rust_cuda::host::HostDeviceBox::from( - rust_cuda::rustacuda::memory::DeviceBox::new( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) + let mut #pat_box = #crate_path::host::HostDeviceBox::from( + #crate_path::rustacuda::memory::DeviceBox::new( + #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) )? ); #[allow(clippy::redundant_closure_call)] // Safety: `#pat_box` contains exactly the device copy of `#pat` let __result = (|#pat| { #inner })(unsafe { - rust_cuda::host::HostAndDeviceConstRef::new( - &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) + #crate_path::host::HostAndDeviceConstRef::new( + &#pat_box, #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) ).as_async() }); @@ -145,7 +149,7 @@ fn generate_raw_func_input_wrap( } } else { quote! { { - let #pat = rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); + let #pat = #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); #inner } } } @@ -153,16 +157,16 @@ fn generate_raw_func_input_wrap( InputCudaType::LendRustToCuda => { if let syn::Type::Reference(syn::TypeReference { mutability, .. 
}) = &**ty { if mutability.is_some() { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( + quote! { #crate_path::host::LendToCuda::lend_to_cuda_mut( #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } else { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda( + quote! { #crate_path::host::LendToCuda::lend_to_cuda( #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } } else { - quote! { rust_cuda::host::LendToCuda::move_to_cuda( + quote! { #crate_path::host::LendToCuda::move_to_cuda( #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 50e74b02e..c24406c9a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -6,6 +6,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -38,11 +39,11 @@ pub(super) fn generate_async_func_types( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote! { - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote! 
{ - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -62,11 +63,11 @@ pub(super) fn generate_async_func_types( } quote!( - rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { quote!( - rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -77,7 +78,7 @@ pub(super) fn generate_async_func_types( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> }; quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index 0fed7282f..16cd0008e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -6,6 +6,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -39,11 +40,11 @@ pub(in super::super) fn generate_launch_types( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -57,18 +58,18 @@ pub(in super::super) fn generate_launch_types( { if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { let lifetime = r2c_move_lifetime(i, ty); quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote! { #cuda_type } @@ -79,16 +80,16 @@ pub(in super::super) fn generate_launch_types( if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> + #crate_path::common::DeviceMutRef<'static, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<'static, #cuda_type> + #crate_path::common::DeviceConstRef<'static, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> + #crate_path::common::DeviceMutRef<'static, #cuda_type> } } else { cuda_type diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 6980a5753..44cc4d904 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -12,6 +12,7 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( + crate_path: &syn::Path, config @ KernelConfig { args, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_wrapper_params, @@ -26,29 +27,40 @@ pub(super) fn quote_kernel_func_async( func_attrs: &[syn::Attribute], macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_async = - generate_async_func_types(config, decl_generics, func_inputs, macro_type_ids); + let new_func_inputs_async = generate_async_func_types( + crate_path, + config, + decl_generics, + func_inputs, + macro_type_ids, + ); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types(config, decl_generics, func_inputs, macro_type_ids); + generate_launch_types( + crate_path, + config, + decl_generics, + func_inputs, + macro_type_ids, + ); quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_async),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { - let rust_cuda::host::LaunchPackage { + let #crate_path::host::LaunchPackage { kernel, watcher, config - } = rust_cuda::host::Launcher::get_launch_package(self); + } = #crate_path::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { - rust_cuda::ptx_jit::compilePtxJITwithArguments! { + #crate_path::ptx_jit::compilePtxJITwithArguments! { kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) }? } else { @@ -56,13 +68,13 @@ pub(super) fn quote_kernel_func_async( }; let function = match kernel_jit_result { - rust_cuda::host::KernelJITResult::Recompiled(function) => { + #crate_path::host::KernelJITResult::Recompiled(function) => { // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; + ::on_compile(function, watcher)?; function }, - rust_cuda::host::KernelJITResult::Cached(function) => function, + #crate_path::host::KernelJITResult::Cached(function) => function, }; #[allow(clippy::redundant_closure_call)] @@ -79,14 +91,14 @@ pub(super) fn quote_kernel_func_async( if false { #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} + fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_aliasing() {} #[allow(dead_code)] fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, + T: #crate_path::safety::FitsIntoDeviceRegister, >(_val: &T) {} #(assert_impl_devicecopy(&#func_params);)* @@ -94,7 +106,7 @@ pub(super) fn quote_kernel_func_async( #(assert_impl_fits_into_device_register(&#func_params);)* } - let 
rust_cuda::host::LaunchConfig { + let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, } = config; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 52fd5c506..aedf1e12e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -13,6 +13,7 @@ use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( + crate_path: &syn::Path, config @ KernelConfig { visibility, kernel, @@ -56,6 +57,7 @@ pub(in super::super) fn quote_cpu_linker_macro( }; let get_ptx_str = quote_get_ptx_str( + crate_path, func_ident, config, decl_generics, @@ -63,8 +65,15 @@ pub(in super::super) fn quote_cpu_linker_macro( func_params, ¯o_type_ids, ); - let new_kernel = quote_new_kernel(config, decl_generics, func_ident, ¯o_type_ids); + let new_kernel = quote_new_kernel( + crate_path, + config, + decl_generics, + func_ident, + ¯o_type_ids, + ); let kernel_func = quote_kernel_func( + crate_path, config, decl_generics, func_inputs, @@ -74,6 +83,7 @@ pub(in super::super) fn quote_cpu_linker_macro( ¯o_type_ids, ); let kernel_func_async = quote_kernel_func_async( + crate_path, config, decl_generics, func_inputs, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs index fa32591db..6b53954e4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs @@ -3,6 +3,7 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; pub(super) fn quote_new_kernel( + crate_path: &syn::Path, KernelConfig { kernel, .. 
}: &KernelConfig, DeclGenerics { generic_start_token, @@ -15,19 +16,19 @@ pub(super) fn quote_new_kernel( macro_type_ids: &[syn::Ident], ) -> TokenStream { quote! { - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel > { let ptx = Self::get_ptx_str(); - let entry_point = rust_cuda::host::specialise_kernel_call!( + let entry_point = #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token ); - rust_cuda::host::TypedKernel::new(ptx, entry_point) + #crate_path::host::TypedKernel::new(ptx, entry_point) } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index e5c318140..4851af9ce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -7,6 +7,7 @@ use super::super::{ }; pub(in super::super) fn quote_cpu_wrapper( + crate_path: &syn::Path, config @ KernelConfig { visibility, kernel, .. }: &KernelConfig, @@ -29,7 +30,7 @@ pub(in super::super) fn quote_cpu_wrapper( func_attrs: &[syn::Attribute], ) -> TokenStream { let launcher_predicate = quote! { - Self: Sized + rust_cuda::host::Launcher< + Self: Sized + #crate_path::host::Launcher< KernelTraitObject = dyn #kernel #ty_generics > }; @@ -55,7 +56,7 @@ pub(in super::super) fn quote_cpu_wrapper( }; let (new_func_inputs_decl, new_func_inputs_async_decl) = - generate_new_func_inputs_decl(config, impl_generics, func_inputs); + generate_new_func_inputs_decl(crate_path, config, impl_generics, func_inputs); quote! 
{ #[cfg(not(target_os = "cuda"))] @@ -65,32 +66,33 @@ pub(in super::super) fn quote_cpu_wrapper( { fn get_ptx_str() -> &'static str where #launcher_predicate; - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel + fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel > where #launcher_predicate; #(#func_attrs)* #[allow(clippy::too_many_arguments)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; #(#func_attrs)* #[allow(clippy::too_many_arguments)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_async_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } } } fn generate_new_func_inputs_decl( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { @@ -146,11 +148,11 @@ fn generate_new_func_inputs_decl( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => syn::parse_quote!( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> ), InputCudaType::LendRustToCuda => syn::parse_quote!( - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), }; @@ -163,11 +165,11 @@ fn generate_new_func_inputs_decl( { let wrapped_type = if mutability.is_some() { syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -176,7 +178,7 @@ fn generate_new_func_inputs_decl( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> ); Box::new(wrapped_type) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index d017efae1..36e316708 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -6,6 +6,7 @@ use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( + crate_path: &syn::Path, config @ KernelConfig { args, .. 
}: &KernelConfig, inputs @ FunctionInputs { func_inputs, @@ -19,8 +20,8 @@ pub(in super::super) fn quote_cuda_wrapper( func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(config, inputs); + let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(crate_path, config, inputs); + let ptx_func_unboxed_types = specialise_ptx_unboxed_types(crate_path, config, inputs); let func_layout_params = func_params .iter() @@ -46,13 +47,13 @@ pub(in super::super) fn quote_cuda_wrapper( // Emit PTX JIT load markers let ptx_jit_load = if ptx_jit.0 { quote! { - rust_cuda::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) + #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) } } else { quote! {} }; let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) }; match cuda_mode { @@ -70,22 +71,22 @@ pub(in super::super) fn quote_cuda_wrapper( if mutability.is_some() { quote! { #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, + #crate_path::device::BorrowFromRust::with_borrow_from_rust_mut( + #pat, |#pat: #and_token #mutability #crate_path::device::ShallowCopy<#syn_type>| { #inner }, ) } } else { quote! { #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, + #crate_path::device::BorrowFromRust::with_borrow_from_rust( + #pat, |#pat: #and_token #crate_path::device::ShallowCopy<#syn_type>| { #inner }, ) } } } else { quote! 
{ #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_moved_from_rust( + #crate_path::device::BorrowFromRust::with_moved_from_rust( #pat, |#pat: #syn_type| { #inner }, ) } @@ -99,22 +100,22 @@ pub(in super::super) fn quote_cuda_wrapper( quote! { #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { #( #[no_mangle] static #func_layout_params: [ - u8; rust_cuda::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = rust_cuda::const_type_layout::serialise_type_graph::<#ptx_func_types>(); + u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() + ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); *#func_params = &#func_layout_params; )* } #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { @@ -130,14 +131,14 @@ pub(in super::super) fn quote_cuda_wrapper( if false { #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} + fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_aliasing() {} #[allow(dead_code)] fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, + T: #crate_path::safety::FitsIntoDeviceRegister, >(_val: &T) {} #(assert_impl_devicecopy(&#func_params);)* @@ -151,6 +152,7 @@ pub(in super::super) fn quote_cuda_wrapper( } fn specialise_ptx_func_inputs( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, FunctionInputs { func_inputs, @@ -172,16 +174,16 @@ fn specialise_ptx_func_inputs( ) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) }; let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -198,11 +200,11 @@ fn specialise_ptx_func_inputs( if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { @@ -211,7 +213,7 @@ fn specialise_ptx_func_inputs( }; quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { cuda_type @@ -229,6 +231,7 @@ fn specialise_ptx_func_inputs( } fn specialise_ptx_unboxed_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, ) -> Vec { @@ -240,7 +243,7 @@ fn specialise_ptx_unboxed_types( let type_ident = quote::format_ident!("__T_{}", i); quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index c057fe7f1..76b88eee6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -38,7 +38,63 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }, }; - let func = parse_kernel_fn(func); + let mut func = parse_kernel_fn(func); + + let mut crate_path = None; + + func.attrs.retain(|attr| { + if attr.path.is_ident("kernel") { + if let Ok(syn::Meta::List(list)) = attr.parse_meta() { + for meta in &list.nested { + match meta { + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + + return false; + } + + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[kernel(crate)] attribute.", + ); + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[kernel(crate = \ + \"\")] attribute: {}.", + err + ), + }, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + ); + } + } + } + } else { + emit_error!( + attr.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." 
+ ); + } + + false + } else { + true + } + }); + + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -177,6 +233,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( + &crate_path, &config, &decl_generics, &impl_generics, @@ -184,8 +241,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_ident, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&func_ident, &config); + let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); let cpu_linker_macro = quote_cpu_linker_macro( + &crate_path, &config, &decl_generics, &func_inputs, @@ -194,6 +252,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, ); let cuda_wrapper = quote_cuda_wrapper( + &crate_path, &config, &func_inputs, &func_ident, @@ -298,6 +357,7 @@ fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option< } fn quote_generic_check( + crate_path: &syn::Path, FuncIdent { func_ident_hash, .. }: &FuncIdent, @@ -313,11 +373,11 @@ fn quote_generic_check( quote::quote_spanned! 
{ func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - rust_cuda::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), + const _: #crate_path::safety::kernel_signature::Assert<{ + #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match + }> = #crate_path::safety::kernel_signature::Assert::<{ + #crate_path::safety::kernel_signature::check( + #crate_path::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() ) }>; diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 61891aa8c..c6659e9c9 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -5,6 +5,7 @@ use super::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( + crate_path: &syn::Path, field: &syn::Field, field_index: usize, @@ -33,12 +34,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::common::DeviceAccessible::from( &self.#field_accessor, ); }); r2c_field_async_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::common::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -49,26 +50,26 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#field_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::host::CombinedCudaAlloc< + <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( &self.#field_accessor, alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, stream, @@ -80,13 +81,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::common::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( &mut self.#field_accessor, alloc_front, stream, @@ -95,30 +96,30 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! 
{ - rust_cuda::host::CombinedCudaAlloc< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::host::CombinedCudaAlloc< + <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( < - #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -130,17 +131,17 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::common::RustToCuda::restore( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( < - #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, stream, @@ -149,8 +150,8 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::RustToCudaProxy::<#field_ty>::into( - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::common::RustToCudaProxy::<#field_ty>::into( + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 8416d3c17..21509ef8c 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -12,7 +12,10 @@ pub enum CudaReprFieldTy { }, } -pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprFieldTy { +pub fn swap_field_type_and_filter_attrs( + crate_path: &syn::Path, + field: &mut syn::Field, +) -> CudaReprFieldTy { let mut cuda_repr_field_ty: Option = None; let mut field_ty = field.ty.clone(); @@ -33,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#field_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#field_ty as #crate_path::common::RustToCuda>::CudaRepresentation > }; } else { @@ -54,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#proxy_ty as #crate_path::common::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -104,8 +107,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - rust_cuda::common::DeviceAccessible< - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::common::DeviceAccessible< + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 646686534..b9335db46 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -10,6 +10,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( syn::Generics, Vec, bool, + syn::Path, ) { let mut type_params = ast .generics @@ -30,8 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } let mut r2c_ignore = false; - let mut r2c_async_impl = None; + let mut crate_path = None; struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { @@ -104,6 +105,30 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( "[rust-cuda]: Duplicate #[cuda(async)] attribute.", ); }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + } else { + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[cuda(crate)] attribute.", + ); + } + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[cuda(crate = \ + \"\")] attribute: {}.", + err + ), + }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { path: syn::Path { leading_colon: None, @@ -134,9 +159,10 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] \ - struct attribute." 
+ "[rust-cuda]: Expected #[cuda(ignore)] / \ + #[cuda(bound = \"\")] / \ + #[cuda(crate = \"\")] / \ + #[cuda(layout::ATTR = \"VALUE\")] struct attribute." ); }, } @@ -144,8 +170,10 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(ignore)] / \ + #[cuda(bound = \"\")] / \ + #[cuda(crate = \"\")] / \ + #[cuda(layout::ATTR = \"VALUE\")] struct attribute." ); } @@ -155,18 +183,20 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }); + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + for ty in &type_params { struct_generics_cuda .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCuda + #ty: #crate_path::common::RustToCuda }); struct_generics_cuda_async .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCudaAsync + #ty: #crate_path::common::RustToCudaAsync }); } @@ -176,5 +206,6 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( struct_generics_cuda_async, struct_layout_attrs, r2c_async_impl.unwrap_or(true), + crate_path, ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index ff607af28..1ff844645 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -1,7 +1,9 @@ use proc_macro2::TokenStream; use quote::quote; +#[allow(clippy::too_many_arguments)] pub fn cuda_struct_declaration( + crate_path: &syn::Path, struct_attrs_cuda: &[syn::Attribute], struct_layout_attrs: &[syn::Attribute], struct_vis_cuda: &syn::Visibility, @@ -27,23 +29,27 @@ pub fn cuda_struct_declaration( quote!(#where_clause #struct_fields_cuda) }; + let const_type_layout_crate_path = quote! { #crate_path::const_type_layout }.to_string(); + quote! 
{ #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(rust_cuda::const_type_layout::TypeLayout)] + #[derive(#crate_path::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* + #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics rust_cuda::rustacuda_core::DeviceCopy + unsafe impl #impl_generics #crate_path::rustacuda_core::DeviceCopy for #struct_name_cuda #ty_generics #where_clause {} } } #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -70,7 +76,7 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! { - unsafe impl #impl_generics rust_cuda::common::RustToCuda for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::common::RustToCuda for #struct_name #ty_generics #where_clause { type CudaRepresentation = #struct_name_cuda #ty_generics; @@ -79,14 +85,14 @@ pub fn rust_to_cuda_trait( type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, - ) -> rust_cuda::rustacuda::error::CudaResult<( - rust_cuda::common::DeviceAccessible, - rust_cuda::host::CombinedCudaAlloc + ) -> #crate_path::rustacuda::error::CudaResult<( + #crate_path::common::DeviceAccessible, + #crate_path::host::CombinedCudaAlloc )> { - let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_front = #crate_path::host::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -94,18 +100,18 @@ pub fn rust_to_cuda_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - rust_cuda::common::DeviceAccessible::from(borrow), - 
rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::DeviceAccessible::from(borrow), + #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: rust_cuda::host::CombinedCudaAlloc< + alloc: #crate_path::host::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - ) -> rust_cuda::rustacuda::error::CudaResult { + ) -> #crate_path::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_destructors)* @@ -118,6 +124,7 @@ pub fn rust_to_cuda_trait( #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_async_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda_async: &syn::Generics, @@ -143,19 +150,19 @@ pub fn rust_to_cuda_async_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! { - unsafe impl #impl_generics rust_cuda::common::RustToCudaAsync for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::common::RustToCudaAsync for #struct_name #ty_generics #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, - stream: &rust_cuda::rustacuda::stream::Stream, - ) -> rust_cuda::rustacuda::error::CudaResult<( - rust_cuda::common::DeviceAccessible, - rust_cuda::host::CombinedCudaAlloc + stream: &#crate_path::rustacuda::stream::Stream, + ) -> #crate_path::rustacuda::error::CudaResult<( + #crate_path::common::DeviceAccessible, + #crate_path::host::CombinedCudaAlloc )> { - let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_front = #crate_path::host::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -163,19 +170,19 @@ pub fn rust_to_cuda_async_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - rust_cuda::common::DeviceAccessible::from(borrow), - 
rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::DeviceAccessible::from(borrow), + #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: rust_cuda::host::CombinedCudaAlloc< + alloc: #crate_path::host::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - stream: &rust_cuda::rustacuda::stream::Stream, - ) -> rust_cuda::rustacuda::error::CudaResult { + stream: &#crate_path::rustacuda::stream::Stream, + ) -> #crate_path::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* @@ -187,6 +194,7 @@ pub fn rust_to_cuda_async_trait( } pub fn cuda_as_rust_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -210,14 +218,14 @@ pub fn cuda_as_rust_trait( let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics rust_cuda::common::CudaAsRust + unsafe impl #impl_generics #crate_path::common::CudaAsRust for #struct_name_cuda #ty_generics #where_clause { type RustRepresentation = #struct_name #ty_generics; #[cfg(target_os = "cuda")] unsafe fn as_rust( - this: &rust_cuda::common::DeviceAccessible, + this: &#crate_path::common::DeviceAccessible, ) -> #struct_name #ty_generics { #cuda_as_rust_struct_construction } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index dc8eb6491..4173d6658 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -21,8 +21,17 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let struct_name = &ast.ident; let struct_name_cuda = get_cuda_repr_ident(struct_name); + let ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl, + crate_path, + ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); + let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - rust_cuda::host::NullCudaAlloc + #crate_path::host::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); @@ -45,9 +54,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let mut r2c_field_async_destructors_reverse: Vec = Vec::new(); for (field_index, field) in fields.iter_mut().enumerate() { - let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field); + let cuda_repr_field_ty = + field_ty::swap_field_type_and_filter_attrs(&crate_path, field); combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type( + &crate_path, field, field_index, &cuda_repr_field_ty, @@ -69,15 +80,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let ( - struct_attrs_cuda, - struct_generics_cuda, - struct_generics_cuda_async, - struct_layout_attrs, - r2c_async_impl, - ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); - let cuda_struct_declaration = r#impl::cuda_struct_declaration( + &crate_path, &struct_attrs_cuda, &struct_layout_attrs, &ast.vis, @@ -88,6 +92,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ); let rust_to_cuda_trait_impl = r#impl::rust_to_cuda_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda, @@ -100,6 +105,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let rust_to_cuda_async_trait_impl = if r2c_async_impl { r#impl::rust_to_cuda_async_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda_async, @@ -113,6 +119,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { }; let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda, From 73bb289b6266c577a875d53f2e03e947cc7e4d45 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 30 Nov 2022 
11:28:51 -0800 Subject: [PATCH 015/120] Added simple thread-block shared memory support --- examples/single-source/src/main.rs | 11 +++++++++++ src/device/mod.rs | 29 +++++++++++++++++++++++++++++ src/device/utils.rs | 1 - src/lib.rs | 8 ++++++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 79f6e3ec1..891c2db06 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -47,6 +47,17 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( ) where ::CudaRepresentation: rc::safety::StackOnly, { + use rc::device::ThreadBlockShared; + + let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + + unsafe { + (*shared.get().cast::().add(1)).0 = 42; + } + unsafe { + (*shared2.get().cast::().add(2)).1 = 24; + } } #[cfg(not(target_os = "cuda"))] diff --git a/src/device/mod.rs b/src/device/mod.rs index 39ae0719f..583bd2a2e 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -113,3 +113,32 @@ impl DerefMut for ShallowCopy { &mut self.0 } } + +#[repr(transparent)] +pub struct ThreadBlockShared { + shared: *mut T, +} + +impl ThreadBlockShared { + #[must_use] + pub fn new_uninit() -> Self { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", + "mov.u64 {reg}, {reg}_rust_cuda_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } + } + + #[must_use] + pub fn get(&self) -> *mut T { + self.shared + } +} diff --git a/src/device/utils.rs b/src/device/utils.rs index a45ff9c71..897df29ea 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,5 +1,4 @@ use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free diff --git 
a/src/lib.rs b/src/lib.rs index 2c202ffee..795e00cfa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,14 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(stdsimd) )] +#![cfg_attr( + any(all(not(feature = "host"), target_os = "cuda"), doc), + feature(asm_experimental_arch) +)] +#![cfg_attr( + any(all(not(feature = "host"), target_os = "cuda"), doc), + feature(asm_const) +)] #![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] From a23e76e47fd79b9b1ab195d6a7e155a5316345b1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 30 Nov 2022 14:24:46 -0800 Subject: [PATCH 016/120] Fixed device utils doc tests --- src/device/utils.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/device/utils.rs b/src/device/utils.rs index 897df29ea..a45ff9c71 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,4 +1,5 @@ use alloc::alloc::{GlobalAlloc, Layout}; +#[cfg(target_os = "cuda")] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free From 9f330f4d6ca38c3461034cb5aab721103f3abeca Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 1 Dec 2022 03:39:34 -0800 Subject: [PATCH 017/120] Convert cuda thread-block-shared memory address to generic --- src/device/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/mod.rs b/src/device/mod.rs index 583bd2a2e..7c11cb34f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -127,7 +127,7 @@ impl ThreadBlockShared { unsafe { core::arch::asm!( ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", - "mov.u64 {reg}, {reg}_rust_cuda_shared;", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_shared;", reg = out(reg64) shared, align = const(core::mem::align_of::()), size = const(core::mem::size_of::()), From 8970c5b7a95a2e2bb6a4ad08052820e6842ffc42 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 05:47:38 -0800 Subject: [PATCH 018/120] First steps towards better shared 
memory, including dynamic --- .../generate/cpu_linker_macro/kernel_func.rs | 19 ++- .../kernel_func_async/async_func_types.rs | 9 ++ .../kernel_func_async/launch_types.rs | 11 ++ .../kernel/wrapper/generate/cpu_wrapper.rs | 20 +++ .../kernel/wrapper/generate/cuda_wrapper.rs | 32 ++++- .../src/kernel/wrapper/inputs/attribute.rs | 7 +- .../src/kernel/wrapper/inputs/mod.rs | 30 +++- rust-cuda-derive/src/kernel/wrapper/mod.rs | 11 +- src/device/alloc.rs | 16 +++ src/device/{utils.rs => macros.rs} | 105 -------------- src/device/mod.rs | 34 +---- src/device/thread.rs | 133 ++++++++++++++++++ src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/mod.rs | 1 + src/utils/shared/mod.rs | 35 +++++ src/utils/shared/slice.rs | 73 ++++++++++ src/utils/shared/static.rs | 44 ++++++ 18 files changed, 441 insertions(+), 147 deletions(-) create mode 100644 src/device/alloc.rs rename src/device/{utils.rs => macros.rs} (59%) create mode 100644 src/device/thread.rs create mode 100644 src/utils/shared/mod.rs create mode 100644 src/utils/shared/slice.rs create mode 100644 src/utils/shared/static.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index d6e70e276..00208e57e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -13,7 +13,10 @@ pub(super) fn quote_kernel_func( generic_wrapper_where_clause, .. }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, + inputs @ FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, fn_ident @ FuncIdent { func_ident, .. 
}: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], @@ -21,8 +24,9 @@ pub(super) fn quote_kernel_func( ) -> TokenStream { let new_func_inputs = func_inputs .iter() + .zip(func_input_cuda_types.iter()) .enumerate() - .map(|(i, arg)| match arg { + .map(|(i, (arg, (cuda_type, _)))| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -46,6 +50,16 @@ pub(super) fn quote_kernel_func( quote! { #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type } + } else if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(_) = &**ty { + quote! { #(#attrs)* #pat #colon_token + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote! { #(#attrs)* #pat #colon_token + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } } else { quote! { #(#attrs)* #pat #colon_token #syn_type } } @@ -171,6 +185,7 @@ fn generate_raw_func_input_wrap( ) } } }, + InputCudaType::ThreadBlockShared => inner, }, syn::FnArg::Receiver(_) => unreachable!(), }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index c24406c9a..8cbbc7790 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -46,6 +46,15 @@ pub(super) fn generate_async_func_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { + quote! { + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote! 
{ + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index 16cd0008e..cda2d7e4a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -47,6 +47,17 @@ pub(in super::super) fn generate_launch_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => { + if let syn::Type::Slice(_) = &**ty { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } + }, }; cpu_func_types_launch.push( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index 4851af9ce..6b15f2109 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -91,6 +91,7 @@ pub(in super::super) fn quote_cpu_wrapper( } } +#[allow(clippy::too_many_lines)] fn generate_new_func_inputs_decl( crate_path: &syn::Path, KernelConfig { args, .. 
}: &KernelConfig, @@ -132,6 +133,16 @@ fn generate_new_func_inputs_decl( mutability: *mutability, elem: syn_type, })) + } else if matches!(cuda_mode, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(_) = &**ty { + syn::parse_quote!( + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + ) + } else { + syn::parse_quote!( + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + ) + } } else { syn_type } @@ -155,6 +166,15 @@ fn generate_new_func_inputs_decl( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { + syn::parse_quote!( + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + ) + } else { + syn::parse_quote!( + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + ) + }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 36e316708..34db62123 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -33,6 +33,8 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); + let mut shared_slice = Vec::new(); + let ptx_func_input_unwrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() @@ -90,7 +92,24 @@ pub(in super::super) fn quote_cuda_wrapper( #pat, |#pat: #syn_type| { #inner }, ) } - } + }, + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**ty { + shared_slice.push(elem); + + quote! { + #ptx_jit_load; + #crate_path::utils::shared::slice::ThreadBlockSharedSlice::with_uninit( + #pat, |#pat: #syn_type| { #inner }, + ) + } + } else { + quote! 
{ + #ptx_jit_load; + #crate_path::utils::shared::r#static::ThreadBlockShared::with_uninit( + #pat, |#pat: #syn_type| { #inner }, + ) + } + }, } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -186,6 +205,17 @@ fn specialise_ptx_func_inputs( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => { + if let syn::Type::Slice(_) = &**ty { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } + }, }; let ty = if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index ceeee1e3e..6b479a664 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -19,10 +19,11 @@ impl syn::parse::Parse for KernelInputAttribute { let cuda_type = match &*mode.to_string() { "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, "LendRustToCuda" => InputCudaType::LendRustToCuda, + "ThreadBlockShared" => InputCudaType::ThreadBlockShared, _ => abort!( mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", + "Unexpected CUDA transfer mode `{}`: Expected `SafeDeviceCopy`, \ + `LendRustToCuda`, or `ThreadBlockShared`.", mode ), }; @@ -61,7 +62,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{}`: Expected `pass` or `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index f3cc1a4d8..fb010f76c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ 
b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -12,6 +12,7 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } +#[allow(clippy::too_many_lines)] pub(super) fn parse_function_inputs( func: &syn::ItemFn, generic_params: &mut syn::punctuated::Punctuated, @@ -53,9 +54,25 @@ pub(super) fn parse_function_inputs( for attr in attrs { match attr { - KernelInputAttribute::PassType(_span, pass_type) + KernelInputAttribute::PassType(span, pass_type) if cuda_type.is_none() => { + if matches!(pass_type, InputCudaType::ThreadBlockShared) + && !matches!( + &**ty, + syn::Type::Ptr(syn::TypePtr { + mutability: Some(_), + .. + }) + ) + { + abort!( + span, + "Only mutable pointer types can be shared in a \ + thread block." + ); + } + cuda_type = Some(pass_type); }, KernelInputAttribute::PassType(span, _pass_type) => { @@ -208,6 +225,17 @@ fn ensure_reference_type_lifetime( elem, })) }, + ty @ syn::Type::Ptr(syn::TypePtr { elem, .. }) => { + if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(syn::TypeSlice { elem, .. 
}) = &**elem { + elem.clone() + } else { + elem.clone() + } + } else { + Box::new(ty.clone()) + } + }, ty => { if matches!(cuda_type, InputCudaType::LendRustToCuda) { generic_params.insert( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 76b88eee6..744a0f8d8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -205,7 +205,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .map(|(arg, ident)| match arg { + .zip(&func_inputs.func_input_cuda_types) + .zip(&func.sig.inputs) + .map(|(((arg, ident), (cuda_type, _)), arg_orig)| match arg { syn::FnArg::Typed(syn::PatType { attrs, colon_token, @@ -225,6 +227,12 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ty: ty.clone(), }); + if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::FnArg::Typed(syn::PatType { ty: ty_orig, .. 
}) = arg_orig { + *ty = ty_orig.clone(); + } + } + std::mem::replace(arg, ident_fn_arg) }, syn::FnArg::Receiver(_) => unreachable!(), @@ -284,6 +292,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { enum InputCudaType { SafeDeviceCopy, LendRustToCuda, + ThreadBlockShared, } struct InputPtxJit(bool); diff --git a/src/device/alloc.rs b/src/device/alloc.rs new file mode 100644 index 000000000..14a294814 --- /dev/null +++ b/src/device/alloc.rs @@ -0,0 +1,16 @@ +use alloc::alloc::{GlobalAlloc, Layout}; +#[cfg(target_os = "cuda")] +use core::arch::nvptx; + +/// Memory allocator using CUDA malloc/free +pub struct PTXAllocator; + +unsafe impl GlobalAlloc for PTXAllocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + nvptx::malloc(layout.size()).cast() + } + + unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { + nvptx::free(ptr.cast()); + } +} diff --git a/src/device/utils.rs b/src/device/macros.rs similarity index 59% rename from src/device/utils.rs rename to src/device/macros.rs index a45ff9c71..932ca75ae 100644 --- a/src/device/utils.rs +++ b/src/device/macros.rs @@ -1,20 +1,3 @@ -use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] -use core::arch::nvptx; - -/// Memory allocator using CUDA malloc/free -pub struct PTXAllocator; - -unsafe impl GlobalAlloc for PTXAllocator { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - nvptx::malloc(layout.size()).cast() - } - - unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { - nvptx::free(ptr.cast()); - } -} - // Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs #[macro_export] #[doc(hidden)] @@ -130,91 +113,3 @@ macro_rules! 
assert_ne { } }; } - -/// Dimension specified in kernel launching -#[derive(Debug)] -pub struct Dim3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -/// Indices that the kernel code is running on -#[derive(Debug)] -pub struct Idx3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -#[must_use] -pub fn block_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Dim3 { - x: nvptx::_block_dim_x() as u32, - y: nvptx::_block_dim_y() as u32, - z: nvptx::_block_dim_z() as u32, - } - } -} - -#[must_use] -pub fn block_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_block_idx_x() as u32, - y: nvptx::_block_idx_y() as u32, - z: nvptx::_block_idx_z() as u32, - } - } -} - -#[must_use] -pub fn grid_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Dim3 { - x: nvptx::_grid_dim_x() as u32, - y: nvptx::_grid_dim_y() as u32, - z: nvptx::_grid_dim_z() as u32, - } - } -} - -#[must_use] -pub fn thread_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_thread_idx_x() as u32, - y: nvptx::_thread_idx_y() as u32, - z: nvptx::_thread_idx_z() as u32, - } - } -} - -impl Dim3 { - #[must_use] - pub fn size(&self) -> usize { - (self.x as usize) * (self.y as usize) * (self.z as usize) - } -} - -impl Idx3 { - #[must_use] - pub fn as_id(&self, dim: &Dim3) -> usize { - (self.x as usize) - + (self.y as usize) * (dim.x as usize) - + (self.z as usize) * (dim.x as usize) * (dim.y as usize) - } -} - -#[must_use] -pub fn index() -> usize { - let block_id = block_idx().as_id(&grid_dim()); - let thread_id = thread_idx().as_id(&block_dim()); - - block_id * block_dim().size() + thread_id -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 7c11cb34f..45c833923 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -12,7 +12,10 @@ use crate::{ safety::SafeDeviceCopy, }; -pub mod utils; +pub mod alloc; +pub mod thread; + +mod macros; pub trait BorrowFromRust: RustToCuda { /// # Safety @@ -113,32 +116,3 @@ impl 
DerefMut for ShallowCopy { &mut self.0 } } - -#[repr(transparent)] -pub struct ThreadBlockShared { - shared: *mut T, -} - -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); - } - - Self { shared } - } - - #[must_use] - pub fn get(&self) -> *mut T { - self.shared - } -} diff --git a/src/device/thread.rs b/src/device/thread.rs new file mode 100644 index 000000000..8f3bc5719 --- /dev/null +++ b/src/device/thread.rs @@ -0,0 +1,133 @@ +#[cfg(target_os = "cuda")] +use core::arch::nvptx; + +#[allow(clippy::module_name_repetitions)] +pub struct Thread { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlock { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlockGrid { + _private: (), +} + +impl Thread { + #[must_use] + pub fn this() -> Self { + Self { _private: () } + } + + #[must_use] + pub fn index(&self) -> usize { + let block = self.block(); + let grid = block.grid(); + + let block_id = block.idx().as_id(&grid.dim()); + let thread_id = self.idx().as_id(&block.dim()); + + block_id * block.dim().size() + thread_id + } + + #[must_use] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_thread_idx_x() as u32, + y: nvptx::_thread_idx_y() as u32, + z: nvptx::_thread_idx_z() as u32, + } + } + } + + #[must_use] + pub fn block(&self) -> ThreadBlock { + ThreadBlock { _private: () } + } +} + +impl ThreadBlock { + #[must_use] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_block_dim_x() as u32, + y: nvptx::_block_dim_y() as u32, + z: nvptx::_block_dim_z() as u32, + } + } + } + + #[must_use] + pub fn idx(&self) -> Idx3 { 
+ #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_block_idx_x() as u32, + y: nvptx::_block_idx_y() as u32, + z: nvptx::_block_idx_z() as u32, + } + } + } + + #[must_use] + pub fn grid(&self) -> ThreadBlockGrid { + ThreadBlockGrid { _private: () } + } + + pub fn synchronize(&self) { + unsafe { nvptx::_syncthreads() } + } +} + +impl ThreadBlockGrid { + #[must_use] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_grid_dim_x() as u32, + y: nvptx::_grid_dim_y() as u32, + z: nvptx::_grid_dim_z() as u32, + } + } + } +} + +/// Dimension specified in kernel launching +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Dim3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +/// Indices that the kernel code is running on +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Idx3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +impl Dim3 { + #[must_use] + pub fn size(&self) -> usize { + (self.x as usize) * (self.y as usize) * (self.z as usize) + } +} + +impl Idx3 { + #[must_use] + pub fn as_id(&self, dim: &Dim3) -> usize { + (self.x as usize) + + (self.y as usize) * (dim.x as usize) + + (self.z as usize) * (dim.x as usize) * (dim.y as usize) + } +} diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index a60a94eb9..ea5f1bba4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -28,7 +28,7 @@ unsafe impl DeviceCopy #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_const_stride(slice: &[E]) -> &[E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } @@ -36,7 +36,7 @@ fn split_slice_const_stride(slice: &[E]) -> &[E] { #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn 
split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 668112f88..c2ad169ff 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -28,7 +28,7 @@ unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } @@ -36,7 +36,7 @@ fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 303e96262..c70432f31 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -4,6 +4,7 @@ pub mod aliasing; pub mod alloc; pub mod device_copy; pub mod exchange; +pub mod shared; mod r#box; mod boxed_slice; diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs new file mode 100644 index 000000000..8b49ca6d3 --- /dev/null +++ b/src/utils/shared/mod.rs @@ -0,0 +1,35 @@ +pub mod slice; +pub mod r#static; + 
+#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +#[allow(clippy::module_name_repetitions)] +pub trait ThreadBlockShared: 'static + Sized { + fn share_uninit() -> r#static::ThreadBlockShared; +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockShared for T { + fn share_uninit() -> r#static::ThreadBlockShared { + r#static::ThreadBlockShared::uninit() + } +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +pub trait ThreadBlockSharedSlice: 'static { + type Elem: Sized; + + fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice; +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice for [T] { + type Elem = T; + + fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice { + slice::ThreadBlockSharedSlice::with_len(len) + } +} diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs new file mode 100644 index 000000000..098670fba --- /dev/null +++ b/src/utils/shared/slice.rs @@ -0,0 +1,73 @@ +use rustacuda_core::DeviceCopy; + +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct ThreadBlockSharedSlice { + len: usize, + byte_offset: usize, + marker: [T; 0], +} + +unsafe impl DeviceCopy for ThreadBlockSharedSlice {} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice { + #[must_use] + pub fn with_len(len: usize) -> Self { + Self { + len, + byte_offset: 0, + marker: [], + } + } + + #[must_use] + pub fn len(&self) -> usize { + self.len + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len == 0 + } +} 
+ +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +impl ThreadBlockSharedSlice { + /// # Safety + /// + /// The thread-block shared dynamic memory must be initialised once and + /// only once per kernel. + pub unsafe fn init() { + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", + align = const(core::mem::align_of::()), + ); + } + } + + /// # Safety + /// + /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one + /// call to [`ThreadBlockSharedSlice::init`] for the type `T` amongst + /// all `ThreadBlockSharedSlice` that has the largest alignment. + pub unsafe fn with_uninit Q, Q>(self, inner: F) -> Q { + let base: *mut u8; + + unsafe { + core::arch::asm!( + "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", + reg = out(reg64) base, + ); + } + + let slice = + core::ptr::slice_from_raw_parts_mut(base.add(self.byte_offset).cast(), self.len); + + inner(slice) + } +} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs new file mode 100644 index 000000000..53f8aeb9e --- /dev/null +++ b/src/utils/shared/static.rs @@ -0,0 +1,44 @@ +use rustacuda_core::DeviceCopy; + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct ThreadBlockShared { + marker: [T; 0], +} + +unsafe impl DeviceCopy for ThreadBlockShared {} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockShared { + #[must_use] + pub fn uninit() -> Self { + Self { marker: [] } + } +} + +#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] +#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +impl ThreadBlockShared { + #[must_use] + pub fn new_uninit() -> *mut T { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", + 
reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + shared + } + + #[must_use] + pub fn with_uninit Q, Q>(self, inner: F) -> Q { + inner(Self::new_uninit()) + } +} From 79792bd56602f30591a535edcee9536668322211 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 13:24:23 -0800 Subject: [PATCH 019/120] Revert derive changes + R2C-based approach start --- examples/single-source/src/main.rs | 16 ++-- .../generate/cpu_linker_macro/kernel_func.rs | 19 +---- .../kernel_func_async/async_func_types.rs | 9 -- .../kernel_func_async/launch_types.rs | 11 --- .../kernel/wrapper/generate/cpu_wrapper.rs | 20 ----- .../kernel/wrapper/generate/cuda_wrapper.rs | 32 +------ .../src/kernel/wrapper/inputs/attribute.rs | 7 +- .../src/kernel/wrapper/inputs/mod.rs | 30 +------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 11 +-- src/safety/stack_only.rs | 4 + src/utils/shared/mod.rs | 35 +------- src/utils/shared/static.rs | 84 +++++++++++++++++-- 12 files changed, 99 insertions(+), 179 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 891c2db06..2e1c9e199 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,8 @@ extern crate alloc; +use rc::utils::shared::r#static::ThreadBlockShared; + #[cfg(not(target_os = "cuda"))] fn main() {} @@ -44,19 +46,21 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, + #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { - use rc::device::ThreadBlockShared; - let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); unsafe { - 
(*shared.get().cast::().add(1)).0 = 42; + (*shared.as_mut_ptr().cast::().add(1)).0 = 42; + } + unsafe { + (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; } unsafe { - (*shared2.get().cast::().add(2)).1 = 24; + *shared3.as_mut_ptr() = 12; } } @@ -84,10 +88,10 @@ mod host { mod cuda_prelude { use core::arch::nvptx; - use rc::device::utils; + use rc::device::alloc::PTXAllocator; #[global_allocator] - static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator; + static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator; #[panic_handler] fn panic(_: &::core::panic::PanicInfo) -> ! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 00208e57e..d6e70e276 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -13,10 +13,7 @@ pub(super) fn quote_kernel_func( generic_wrapper_where_clause, .. }: &DeclGenerics, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], @@ -24,9 +21,8 @@ pub(super) fn quote_kernel_func( ) -> TokenStream { let new_func_inputs = func_inputs .iter() - .zip(func_input_cuda_types.iter()) .enumerate() - .map(|(i, (arg, (cuda_type, _)))| match arg { + .map(|(i, arg)| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -50,16 +46,6 @@ pub(super) fn quote_kernel_func( quote! { #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type } - } else if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(_) = &**ty { - quote! { #(#attrs)* #pat #colon_token - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote! 
{ #(#attrs)* #pat #colon_token - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } } else { quote! { #(#attrs)* #pat #colon_token #syn_type } } @@ -185,7 +171,6 @@ fn generate_raw_func_input_wrap( ) } } }, - InputCudaType::ThreadBlockShared => inner, }, syn::FnArg::Receiver(_) => unreachable!(), }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 8cbbc7790..c24406c9a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -46,15 +46,6 @@ pub(super) fn generate_async_func_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { - quote! { - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote! { - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index cda2d7e4a..16cd0008e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -47,17 +47,6 @@ pub(in super::super) fn generate_launch_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => { - if let syn::Type::Slice(_) = &**ty { - quote::quote_spanned! 
{ ty.span()=> - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } - }, }; cpu_func_types_launch.push( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index 6b15f2109..4851af9ce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -91,7 +91,6 @@ pub(in super::super) fn quote_cpu_wrapper( } } -#[allow(clippy::too_many_lines)] fn generate_new_func_inputs_decl( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, @@ -133,16 +132,6 @@ fn generate_new_func_inputs_decl( mutability: *mutability, elem: syn_type, })) - } else if matches!(cuda_mode, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(_) = &**ty { - syn::parse_quote!( - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - ) - } else { - syn::parse_quote!( - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - ) - } } else { syn_type } @@ -166,15 +155,6 @@ fn generate_new_func_inputs_decl( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { - syn::parse_quote!( - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - ) - } else { - syn::parse_quote!( - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - ) - }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 34db62123..36e316708 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -33,8 +33,6 @@ pub(in super::super) fn 
quote_cuda_wrapper( }) .collect::>(); - let mut shared_slice = Vec::new(); - let ptx_func_input_unwrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() @@ -92,24 +90,7 @@ pub(in super::super) fn quote_cuda_wrapper( #pat, |#pat: #syn_type| { #inner }, ) } - }, - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**ty { - shared_slice.push(elem); - - quote! { - #ptx_jit_load; - #crate_path::utils::shared::slice::ThreadBlockSharedSlice::with_uninit( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } else { - quote! { - #ptx_jit_load; - #crate_path::utils::shared::r#static::ThreadBlockShared::with_uninit( - #pat, |#pat: #syn_type| { #inner }, - ) - } - }, + } } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -205,17 +186,6 @@ fn specialise_ptx_func_inputs( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => { - if let syn::Type::Slice(_) = &**ty { - quote::quote_spanned! { ty.span()=> - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote::quote_spanned! 
{ ty.span()=> - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } - }, }; let ty = if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index 6b479a664..ceeee1e3e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -19,11 +19,10 @@ impl syn::parse::Parse for KernelInputAttribute { let cuda_type = match &*mode.to_string() { "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, "LendRustToCuda" => InputCudaType::LendRustToCuda, - "ThreadBlockShared" => InputCudaType::ThreadBlockShared, _ => abort!( mode.span(), - "Unexpected CUDA transfer mode `{}`: Expected `SafeDeviceCopy`, \ - `LendRustToCuda`, or `ThreadBlockShared`.", + "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ + `LendRustToCuda`.", mode ), }; @@ -62,7 +61,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index fb010f76c..f3cc1a4d8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -12,7 +12,6 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } -#[allow(clippy::too_many_lines)] pub(super) fn parse_function_inputs( func: &syn::ItemFn, generic_params: &mut syn::punctuated::Punctuated, @@ -54,25 +53,9 @@ pub(super) fn parse_function_inputs( for attr in attrs { match attr { - KernelInputAttribute::PassType(span, pass_type) + KernelInputAttribute::PassType(_span, pass_type) if cuda_type.is_none() => { - if matches!(pass_type, 
InputCudaType::ThreadBlockShared) - && !matches!( - &**ty, - syn::Type::Ptr(syn::TypePtr { - mutability: Some(_), - .. - }) - ) - { - abort!( - span, - "Only mutable pointer types can be shared in a \ - thread block." - ); - } - cuda_type = Some(pass_type); }, KernelInputAttribute::PassType(span, _pass_type) => { @@ -225,17 +208,6 @@ fn ensure_reference_type_lifetime( elem, })) }, - ty @ syn::Type::Ptr(syn::TypePtr { elem, .. }) => { - if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**elem { - elem.clone() - } else { - elem.clone() - } - } else { - Box::new(ty.clone()) - } - }, ty => { if matches!(cuda_type, InputCudaType::LendRustToCuda) { generic_params.insert( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 744a0f8d8..76b88eee6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -205,9 +205,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .zip(&func_inputs.func_input_cuda_types) - .zip(&func.sig.inputs) - .map(|(((arg, ident), (cuda_type, _)), arg_orig)| match arg { + .map(|(arg, ident)| match arg { syn::FnArg::Typed(syn::PatType { attrs, colon_token, @@ -227,12 +225,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ty: ty.clone(), }); - if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::FnArg::Typed(syn::PatType { ty: ty_orig, .. 
}) = arg_orig { - *ty = ty_orig.clone(); - } - } - std::mem::replace(arg, ident_fn_arg) }, syn::FnArg::Receiver(_) => unreachable!(), @@ -292,7 +284,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { enum InputCudaType { SafeDeviceCopy, LendRustToCuda, - ThreadBlockShared, } struct InputPtxJit(bool); diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index e96f48993..ce8887bb3 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -36,5 +36,9 @@ mod sealed { impl !StackOnly for &T {} impl !StackOnly for &mut T {} + impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} + // impl !StackOnly for + // crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index 8b49ca6d3..dcfe3b008 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,35 +1,2 @@ -pub mod slice; +// pub mod slice; pub mod r#static; - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -#[allow(clippy::module_name_repetitions)] -pub trait ThreadBlockShared: 'static + Sized { - fn share_uninit() -> r#static::ThreadBlockShared; -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockShared for T { - fn share_uninit() -> r#static::ThreadBlockShared { - r#static::ThreadBlockShared::uninit() - } -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -pub trait ThreadBlockSharedSlice: 'static { - type Elem: Sized; - - fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice; -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl 
ThreadBlockSharedSlice for [T] { - type Elem = T; - - fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice { - slice::ThreadBlockSharedSlice::with_len(len) - } -} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 53f8aeb9e..fc3e86b3a 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,19 +1,87 @@ +#[cfg(not(target_os = "cuda"))] +use core::marker::PhantomData; + +use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; -#[derive(TypeLayout)] +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; + +#[cfg(not(target_os = "cuda"))] +#[repr(transparent)] +pub struct ThreadBlockShared { + marker: PhantomData, +} + +#[cfg(target_os = "cuda")] #[repr(transparent)] pub struct ThreadBlockShared { + shared: *mut T, +} + +#[doc(hidden)] +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct ThreadBlockSharedCudaRepresentation { + // Note: uses a zero-element array instead of PhantomData here so that + // TypeLayout can still observe T's layout marker: [T; 0], } -unsafe impl DeviceCopy for ThreadBlockShared {} +unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} + +unsafe impl RustToCuda for ThreadBlockShared { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + type CudaAllocation = crate::host::NullCudaAlloc; + type CudaRepresentation = ThreadBlockSharedCudaRepresentation; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + Ok(( + DeviceAccessible::from(ThreadBlockSharedCudaRepresentation { marker: [] }), + crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_null, alloc): (crate::host::NullCudaAlloc, A) = 
alloc.split(); + + Ok(alloc) + } +} + +unsafe impl CudaAsRust + for ThreadBlockSharedCudaRepresentation +{ + type RustRepresentation = ThreadBlockShared; + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { + ThreadBlockShared::new_uninit() + } +} #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] impl ThreadBlockShared { #[must_use] - pub fn uninit() -> Self { - Self { marker: [] } + pub fn new_uninit() -> Self { + Self { + marker: PhantomData::, + } } } @@ -21,7 +89,7 @@ impl ThreadBlockShared { #[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] impl ThreadBlockShared { #[must_use] - pub fn new_uninit() -> *mut T { + pub fn new_uninit() -> Self { let shared: *mut T; unsafe { @@ -34,11 +102,11 @@ impl ThreadBlockShared { ); } - shared + Self { shared } } #[must_use] - pub fn with_uninit Q, Q>(self, inner: F) -> Q { - inner(Self::new_uninit()) + pub fn as_mut_ptr(&self) -> *mut T { + self.shared } } From 914dd90f9628a13c7a942a05b333bee65b9b852e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 14:13:25 -0800 Subject: [PATCH 020/120] Some progress on shared slices --- src/lib.rs | 1 + src/safety/stack_only.rs | 6 +- src/utils/shared/mod.rs | 2 +- src/utils/shared/slice.rs | 160 +++++++++++++++++++++++++++---------- src/utils/shared/static.rs | 72 ++++++++--------- 5 files changed, 160 insertions(+), 81 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 795e00cfa..0c149d40f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(asm_const) )] +#![cfg_attr(target_os = "cuda", feature(ptr_metadata))] #![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 
ce8887bb3..eb3a69706 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -37,8 +37,10 @@ mod sealed { impl !StackOnly for &mut T {} impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - // impl !StackOnly for - // crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl !StackOnly + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index dcfe3b008..88a586ad6 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,2 +1,2 @@ -// pub mod slice; +pub mod slice; pub mod r#static; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 098670fba..238b1aac8 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,73 +1,151 @@ +#[cfg(not(target_os = "cuda"))] +use core::marker::PhantomData; + +use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; + +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + len: usize, + marker: PhantomData, +} + +#[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + shared: *mut [T], +} + +#[doc(hidden)] #[derive(TypeLayout)] +#[layout(bound = "T: 'static + ~const TypeGraphLayout")] #[repr(C)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSliceCudaRepresentation { len: usize, - byte_offset: usize, + // Note: uses a zero-element array instead of PhantomData here so that + // TypeLayout can still observe T's layout marker: [T; 0], } -unsafe impl DeviceCopy for ThreadBlockSharedSlice {} +unsafe impl DeviceCopy + for ThreadBlockSharedSliceCudaRepresentation +{ +} -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = 
"host"), target_os = "cuda"))))] -impl ThreadBlockSharedSlice { +// #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +// #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice { + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] #[must_use] - pub fn with_len(len: usize) -> Self { + pub fn new_uninit_with_len(len: usize) -> Self { Self { len, - byte_offset: 0, - marker: [], + marker: PhantomData::, } } + #[cfg(not(target_os = "cuda"))] #[must_use] pub fn len(&self) -> usize { self.len } + #[cfg(target_os = "cuda")] + #[must_use] + pub fn len(&self) -> usize { + core::ptr::metadata(self.shared) + } + #[must_use] pub fn is_empty(&self) -> bool { - self.len == 0 + self.len() == 0 + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_slice_ptr(&self) -> *mut [T] { + self.shared + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared.cast() } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockSharedSlice { - /// # Safety - /// - /// The thread-block shared dynamic memory must be initialised once and - /// only once per kernel. 
- pub unsafe fn init() { - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", - align = const(core::mem::align_of::()), - ); - } +unsafe impl RustToCuda for ThreadBlockSharedSlice { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + type CudaAllocation = crate::host::NullCudaAlloc; + type CudaRepresentation = ThreadBlockSharedSliceCudaRepresentation; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + Ok(( + DeviceAccessible::from(ThreadBlockSharedSliceCudaRepresentation { + len: self.len, + marker: [], + }), + crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), + )) } - /// # Safety - /// - /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one - /// call to [`ThreadBlockSharedSlice::init`] for the type `T` amongst - /// all `ThreadBlockSharedSlice` that has the largest alignment. 
- pub unsafe fn with_uninit Q, Q>(self, inner: F) -> Q { - let base: *mut u8; - - unsafe { - core::arch::asm!( - "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", - reg = out(reg64) base, - ); - } + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); + + Ok(alloc) + } +} + +unsafe impl CudaAsRust + for ThreadBlockSharedSliceCudaRepresentation +{ + type RustRepresentation = ThreadBlockSharedSlice; + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { + todo!() + + // unsafe { + // core::arch::asm!( + // ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", + // align = const(core::mem::align_of::()), + // ); + // } + + // let base: *mut u8; - let slice = - core::ptr::slice_from_raw_parts_mut(base.add(self.byte_offset).cast(), self.len); + // unsafe { + // core::arch::asm!( + // "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", + // reg = out(reg64) base, + // ); + // } - inner(slice) + // let slice = core::ptr::slice_from_raw_parts_mut( + // base.add(self.byte_offset).cast(), self.len, + // ); } } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index fc3e86b3a..b93e24523 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -29,6 +29,41 @@ pub struct ThreadBlockSharedCudaRepresentation { unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} +impl ThreadBlockShared { + #[cfg(not(target_os = "cuda"))] + #[must_use] + pub fn new_uninit() -> Self { + Self { + marker: PhantomData::, + } + } + + #[cfg(target_os = "cuda")] + #[must_use] + pub fn new_uninit() -> Self { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, 
{reg}_rust_cuda_static_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared + } +} + unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] @@ -73,40 +108,3 @@ unsafe impl CudaAsRust ThreadBlockShared::new_uninit() } } - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - Self { - marker: PhantomData::, - } - } -} - -#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); - } - - Self { shared } - } - - #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { - self.shared - } -} From b0826d7ffa454414e863ad946a85d9aeb96fa440 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 8 Jan 2023 14:31:06 +0000 Subject: [PATCH 021/120] Backup of progress on compile-time PTX checking --- examples/single-source/src/main.rs | 15 ++- rust-cuda-derive/Cargo.toml | 2 + rust-cuda-derive/build.rs | 3 + rust-cuda-derive/src/kernel/link/config.rs | 3 + rust-cuda-derive/src/kernel/link/mod.rs | 122 +++++++++++++++++- .../generate/cpu_linker_macro/get_ptx_str.rs | 2 +- src/safety/device_copy.rs | 7 + src/safety/no_aliasing.rs | 6 + 8 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 
rust-cuda-derive/build.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 2e1c9e199..c3b83d5ec 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ extern crate alloc; +#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -45,23 +46,25 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, - #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, - #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, + // #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.as_mut_ptr().cast::().add(1)).0 = 42; + (*shared.as_mut_ptr().cast::().add(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; } - unsafe { - *shared3.as_mut_ptr() = 12; - } + unsafe { core::arch::asm!("hi") } + // unsafe { + // *shared3.as_mut_ptr() = 12; + // } } #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 4b8677df4..788a08716 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -24,3 +25,4 @@ colored = "2.0" seahash = "4.1" ptx-builder = { git 
= "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } +ptx_compiler = "0.1" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-derive/build.rs new file mode 100644 index 000000000..27d940ad2 --- /dev/null +++ b/rust-cuda-derive/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rustc-link-lib=nvptxcompiler_static"); +} diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index cdfd0b575..bb5f011d6 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, + pub(super) kernel_hash: syn::Ident, pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, @@ -12,6 +13,7 @@ pub(super) struct LinkKernelConfig { impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; + let kernel_hash: syn::Ident = input.parse()?; let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -37,6 +39,7 @@ impl syn::parse::Parse for LinkKernelConfig { Ok(Self { kernel, + kernel_hash, args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 506d8ea03..1b116435c 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,7 +1,12 @@ use std::{ - env, fs, + env, + ffi::CString, + fs, io::{Read, Write}, + mem::MaybeUninit, + os::raw::c_int, path::{Path, PathBuf}, + ptr::addr_of_mut, sync::atomic::{AtomicBool, Ordering}, }; @@ -11,6 +16,7 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; +use ptx_compiler::sys::size_t; use 
super::utils::skip_kernel_compilation; @@ -56,6 +62,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let LinkKernelConfig { kernel, + kernel_hash, args, crate_name, crate_path, @@ -199,6 +206,119 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } + let mut compiler = MaybeUninit::uninit(); + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCreate( + compiler.as_mut_ptr(), + kernel_ptx.len() as size_t, + kernel_ptx.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compiler create result {}", r); + let compiler = unsafe { compiler.assume_init() }; + + let mut major = 0; + let mut minor = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + }; + emit_call_site_warning!("PTX version result {}", r); + emit_call_site_warning!("PTX compiler version {}.{}", major, minor); + + let kernel_name = if specialisation.is_empty() { + format!("{kernel_hash}_kernel") + } else { + format!( + "{kernel_hash}_kernel_{:016x}", + seahash::hash(specialisation.as_bytes()) + ) + }; + + let options = vec![ + CString::new("--entry").unwrap(), + CString::new(kernel_name).unwrap(), + CString::new("--verbose").unwrap(), + CString::new("--warn-on-double-precision-use").unwrap(), + CString::new("--warn-on-local-memory-usage").unwrap(), + CString::new("--warn-on-spills").unwrap(), + ]; + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCompile( + compiler, + options_ptrs.len() as c_int, + options_ptrs.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compile result {}", r); + + let mut info_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + }; + emit_call_site_warning!("PTX info log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = 
Vec::with_capacity(info_log_size as usize); + if info_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX info log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + info_log.set_len(info_log_size as usize); + } + } + let info_log = String::from_utf8_lossy(&info_log); + + let mut error_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + }; + emit_call_site_warning!("PTX error log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + if error_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX error log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + error_log.set_len(error_log_size as usize); + } + } + let error_log = String::from_utf8_lossy(&error_log); + + // Ensure the compiler is not dropped + let mut compiler = MaybeUninit::new(compiler); + let r = unsafe { ptx_compiler::sys::nvPTXCompilerDestroy(compiler.as_mut_ptr()) }; + emit_call_site_warning!("PTX compiler destroy result {}", r); + + if !info_log.is_empty() { + emit_call_site_warning!("PTX compiler info log:\n{}", info_log); + } + if !error_log.is_empty() { + let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; + let mut indent = 0; + while max_lines > 0 { + max_lines /= 10; + indent += 1; + } + + abort_call_site!( + "PTX compiler error log:\n{}\nPTX source:\n{}", + error_log, + kernel_ptx + .lines() + .enumerate() + .map(|(i, l)| format!("{:indent$}| {l}", i + 1)) + .collect::>() + .join("\n") + ); + } + (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index 179ba7eed..d412bd316 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -84,7 +84,7 @@ pub(super) fn quote_get_ptx_str( quote! { fn get_ptx_str() -> &'static str { #crate_path::host::link_kernel!{ - #func_ident #args #crate_name #crate_manifest_dir #generic_start_token + #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token } diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index c5de73430..a2b17627f 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -19,4 +19,11 @@ mod sealed { for crate::utils::device_copy::SafeDeviceCopyWrapper { } + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl SafeDeviceCopy for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl + // SafeDeviceCopy for crate::utils::shared::slice::ThreadBlockSharedSlice + // {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 22488efb8..dbc163e59 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -22,4 +22,10 @@ mod private { { } impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl NoAliasing for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl NoAliasing + // for crate::utils::shared::slice::ThreadBlockSharedSlice {} } From 5538d71c998a7679fe6f533e6edd3d7ffc876408 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 14:49:33 +0000 Subject: [PATCH 022/120] Clean up the PTX 
JIT implementation --- .../cpu_linker_macro/kernel_func_async/mod.rs | 8 +-- .../kernel_func_async/type_wrap.rs | 67 ++++++++++++------- rust-cuda-ptx-jit/src/host/arguments.rs | 48 ------------- .../src/host/compiler/replace.rs | 6 +- rust-cuda-ptx-jit/src/host/mod.rs | 2 - rust-cuda-ptx-jit/src/lib.rs | 8 +++ src/host.rs | 2 +- 7 files changed, 56 insertions(+), 85 deletions(-) delete mode 100644 rust-cuda-ptx-jit/src/host/arguments.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 44cc4d904..c01dcdce3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -35,7 +35,7 @@ pub(super) fn quote_kernel_func_async( macro_type_ids, ); let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(func_inputs); + generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = generate_launch_types( crate_path, @@ -60,11 +60,9 @@ pub(super) fn quote_kernel_func_async( } = #crate_path::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { - #crate_path::ptx_jit::compilePtxJITwithArguments! { - kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) - }? + kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? } else { - kernel.compile_with_ptx_jit_args(None)? + kernel.compile_with_ptx_jit_args(None)? 
}; let function = match kernel_jit_result { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs index 50ea505f1..54ba2945b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs @@ -5,34 +5,49 @@ use crate::kernel::wrapper::InputCudaType; use super::super::super::super::FunctionInputs; pub(super) fn generate_func_input_and_ptx_jit_wraps( + crate_path: &syn::Path, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - quote! { ConstLoad[#pat.for_host()] } - } else { - quote! { Ignore[#pat] } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() +) -> (Vec, TokenStream) { + let mut any_ptx_jit = false; + + let (func_input_wrap, func_cpu_ptx_jit_wrap): (Vec, Vec) = + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .map(|(arg, (cuda_mode, ptx_jit))| match arg { + syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { + #[allow(clippy::if_same_then_else)] + let func_input = if let syn::Type::Reference(_) = &**ty { + quote! { unsafe { #pat.for_device_async() } } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote! { unsafe { #pat.for_device_async() } } + } else { + quote! 
{ #pat } + }; + + let ptx_load = if ptx_jit.0 { + any_ptx_jit = true; + + quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } + } else { + quote! { None } + }; + + (func_input, ptx_load) + }, + syn::FnArg::Receiver(_) => unreachable!(), + }) + .unzip(); + + if any_ptx_jit { + ( + func_input_wrap, + quote!(Some(&[#(#func_cpu_ptx_jit_wrap),*])), + ) + } else { + (func_input_wrap, quote!(None)) + } } diff --git a/rust-cuda-ptx-jit/src/host/arguments.rs b/rust-cuda-ptx-jit/src/host/arguments.rs deleted file mode 100644 index 0a67d42ea..000000000 --- a/rust-cuda-ptx-jit/src/host/arguments.rs +++ /dev/null @@ -1,48 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -macro_rules! compilePtxJITwithArguments { - // Invocation without arguments fast track - ($compiler:ident ()) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ()) - }; - // Invocation without arguments fast track - ($compiler:ident $(. $path:ident)+ ()) => { - $compiler$(.$path)+(None) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ( $($args)* )) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident $(. $path:ident)+ ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [, $($args)*] =>) - }; - // Muncher base case: no `ConstLoad[$expr]` arguments - (@munch None $compiler:ident $(. $path:ident)+ => [] => $($rubbish:expr),*) => { - $compiler$(.$path)+(None) - }; - // Muncher base case: at least one `ConstLoad[$expr]` argument - (@munch Some $compiler:ident $(. $path:ident)+ => [] => $($exprs:expr),*) => { - $compiler$(.$path)+(Some(&[$($exprs),*])) - }; - // Muncher helper case: first `ConstLoad[$expr]` argument is recognised (redirect) - (@munch None $compiler:ident $(. 
$path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [, ConstLoad [ $head ] $($tail)*] => $($exprs),*) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (no `ConstLoad[$expr]`s so far) - (@munch None $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `ConstLoad[$expr]` (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* Some(unsafe { - ::std::slice::from_raw_parts($head as *const _ as *const u8, ::std::mem::size_of_val($head)) - })) - }; -} diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/rust-cuda-ptx-jit/src/host/compiler/replace.rs index df4d270b8..920842d6f 100644 --- a/rust-cuda-ptx-jit/src/host/compiler/replace.rs +++ b/rust-cuda-ptx-jit/src/host/compiler/replace.rs @@ -4,7 +4,7 @@ use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; impl PtxJITCompiler { #[allow(clippy::too_many_lines)] - pub fn with_arguments(&mut self, arguments: Option<&[Option<&[u8]>]>) -> PtxJITResult { + pub fn with_arguments(&mut self, arguments: Option<&[Option<*const [u8]>]>) -> PtxJITResult { // Check if the arguments, cast as byte slices, are the same as the last cached // ones 
#[allow(clippy::explicit_deref_methods)] @@ -16,7 +16,7 @@ impl PtxJITCompiler { .zip(last_arguments.iter()) .all(|(a, b)| match (a, b) { (None, None) => false, - (Some(a), Some(b)) => *a != b.deref(), + (Some(a), Some(b)) => (unsafe { &**a }) != b.deref(), _ => true, }) }, @@ -30,7 +30,7 @@ impl PtxJITCompiler { self.last_arguments = arguments.map(|arguments| { arguments .iter() - .map(|arg| arg.map(|bytes| bytes.to_owned().into_boxed_slice())) + .map(|arg| arg.map(|bytes| unsafe { &*bytes }.to_owned().into_boxed_slice())) .collect::>>>() .into_boxed_slice() }); diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs index d0d9ffb53..2ace3405d 100644 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ b/rust-cuda-ptx-jit/src/host/mod.rs @@ -1,4 +1,2 @@ pub mod compiler; pub mod kernel; - -mod arguments; diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index ae6080a3e..1f22b2830 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -1,5 +1,6 @@ #![deny(clippy::pedantic)] #![cfg_attr(not(feature = "host"), no_std)] +#![feature(ptr_from_ref)] #![feature(doc_cfg)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -12,3 +13,10 @@ pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKer #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] mod device; + +pub fn arg_as_raw_bytes(r: &T) -> *const [u8] { + core::ptr::slice_from_raw_parts( + core::ptr::from_ref(r).cast::(), + core::mem::size_of_val(r), + ) +} diff --git a/src/host.rs b/src/host.rs index a104c50a3..591ed4ed5 100644 --- a/src/host.rs +++ b/src/host.rs @@ -95,7 +95,7 @@ impl TypedKernel { /// (from [`Self::new`]). 
pub fn compile_with_ptx_jit_args( &mut self, - arguments: Option<&[Option<&[u8]>]>, + arguments: Option<&[Option<*const [u8]>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); From eb576605b3415f0eaa9a7b5a028a57e754f7d5d2 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 15:11:54 +0000 Subject: [PATCH 023/120] Add convenience functions for ThreadBlockShared arrays --- examples/single-source/src/main.rs | 15 +++++++-------- src/utils/shared/static.rs | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index c3b83d5ec..af382ff42 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,7 +10,6 @@ extern crate alloc; -#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -47,7 +46,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - // #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { @@ -56,15 +55,15 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.as_mut_ptr().cast::().add(1)).0 = (f64::from(s) * 2.0) as u32; + (*shared.index_mut(1)).0 = (f64::from(s) * 2.0) as u32; + } + unsafe { + (*shared2.index_mut(2)).1 = 24; } + // unsafe { core::arch::asm!("hi") } unsafe { - (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; + *shared3.as_mut_ptr() = 12; } - unsafe { core::arch::asm!("hi") } - // unsafe { - // *shared3.as_mut_ptr() = 12; - // } } #[cfg(not(target_os = "cuda"))] diff --git a/src/utils/shared/static.rs 
b/src/utils/shared/static.rs index b93e24523..58973ace4 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -7,12 +7,14 @@ use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; #[cfg(not(target_os = "cuda"))] +#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { marker: PhantomData, } #[cfg(target_os = "cuda")] +#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { shared: *mut T, @@ -64,6 +66,27 @@ impl ThreadBlockShared { } } +impl ThreadBlockShared<[T; N]> { + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[inline] + #[must_use] + pub fn index(&self, index: usize) -> *const T { + self.index_mut(index) + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[inline] + #[must_use] + pub fn index_mut(&self, index: usize) -> *mut T { + assert!(index < N); + + // Safety: Since *[T; N] is valid, *T is valid iff index < N + unsafe { self.shared.cast::().add(index) } + } +} + unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] From a5ffb0e8da5efda143d1731127a2f11a62cb002e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 16:23:38 +0000 Subject: [PATCH 024/120] Improve and fix CI --- .github/workflows/ci.yml | 109 ++++++------------------- examples/single-source/src/main.rs | 4 +- src/lib.rs | 1 + src/safety/device_copy.rs | 8 +- src/safety/no_aliasing.rs | 5 +- src/safety/stack_only.rs | 2 +- src/utils/shared/slice.rs | 127 ++++++++--------------------- src/utils/shared/static.rs | 25 +++--- 8 files changed, 74 insertions(+), 207 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57b6377f7..2e66a8ed9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,53 +40,25 @@ jobs: sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+") rm llvm.sh cargo install rust-ptx-linker --git 
https://github.com/juntyr/rust-ptx-linker --force + + - name: Install cargo-hack + uses: taiki-e/install-action@cargo-hack - - name: Check without features on CPU - run: | - cargo check - - - name: Check with alloc feature on CPU - run: | - cargo check \ - --features alloc - - - name: Check with derive feature on CPU - run: | - cargo check \ - --features derive - - - name: Check with host feature on CPU - run: | - cargo check \ - --features host - - - name: Check with host,derive,alloc features on CPU + - name: Check feature powerset on the CPU run: | - cargo check \ - --features host,derive,alloc + cargo hack check --feature-powerset --optional-deps \ + --keep-going - - name: Check without features on CUDA + - name: Check feature powerset on CUDA run: | - cargo check \ + cargo hack check --feature-powerset --optional-deps \ + --skip host,rustacuda,rustacuda_derive \ + --keep-going \ --target nvptx64-nvidia-cuda - - name: Check with alloc feature on CUDA - run: | - cargo check \ - --target nvptx64-nvidia-cuda \ - --features alloc - - - name: Check with derive feature on CUDA - run: | - cargo check \ - --target nvptx64-nvidia-cuda \ - --features derive - - name: Check all workspace targets run: | - cargo check \ - --workspace \ - --all-targets + cargo check --workspace --all-targets test: name: Test Suite @@ -176,58 +148,23 @@ jobs: rm llvm.sh cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force - - name: Check the code style without features on CPU - run: | - cargo clippy \ - -- -D warnings - - - name: Check the code style with alloc feature on CPU - run: | - cargo clippy \ - --features alloc \ - -- -D warnings - - - name: Check the code style with derive feature on CPU - run: | - cargo clippy \ - --features derive \ - -- -D warnings - - - name: Check the code style with host feature on CPU - run: | - cargo clippy \ - --features host \ - -- -D warnings - - - name: Check the code style with host,derive,alloc 
features on CPU - run: | - cargo clippy \ - --features host,derive,alloc \ - -- -D warnings - - - name: Check the code style without features on CUDA - run: | - cargo clippy \ - --target nvptx64-nvidia-cuda \ - -- -D warnings + - name: Install cargo-hack + uses: taiki-e/install-action@cargo-hack - - name: Check the code style with alloc feature on CUDA + - name: Check feature powerset on the CPU run: | - cargo clippy \ - --target nvptx64-nvidia-cuda \ - --features alloc \ + cargo hack clippy --feature-powerset --optional-deps \ + --keep-going \ -- -D warnings - - - name: Check the code style with derive feature on CUDA + + - name: Check feature powerset on CUDA run: | - cargo clippy \ + cargo hack clippy --feature-powerset --optional-deps \ + --skip host,rustacuda,rustacuda_derive \ + --keep-going \ --target nvptx64-nvidia-cuda \ - --features derive \ -- -D warnings - - name: Check the code style for all workspace targets + - name: Check all workspace targets run: | - cargo clippy \ - --workspace \ - --all-targets \ - -- -D warnings + cargo clippy --workspace --all-targets -- -D warnings diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index af382ff42..e1030d261 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -55,10 +55,10 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.index_mut(1)).0 = (f64::from(s) * 2.0) as u32; + (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { - (*shared2.index_mut(2)).1 = 24; + (*shared2.index_mut_unchecked(2)).1 = 24; } // unsafe { core::arch::asm!("hi") } unsafe { diff --git a/src/lib.rs b/src/lib.rs index 0c149d40f..de590c29b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(impl_trait_in_assoc_type)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] +#![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] 
#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] #[doc(hidden)] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index a2b17627f..0631f54d1 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -20,10 +20,6 @@ mod sealed { { } - // Only unsafe aliasing is possible since both only expose raw pointers - // impl SafeDeviceCopy for - // crate::utils::shared::r#static::ThreadBlockShared {} - // impl - // SafeDeviceCopy for crate::utils::shared::slice::ThreadBlockSharedSlice - // {} + // No data is actually copied to the device + impl SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index dbc163e59..0b246a52f 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -24,8 +24,5 @@ mod private { impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} // Only unsafe aliasing is possible since both only expose raw pointers - // impl NoAliasing for - // crate::utils::shared::r#static::ThreadBlockShared {} - // impl NoAliasing - // for crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index eb3a69706..47812cbcc 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -37,7 +37,7 @@ mod sealed { impl !StackOnly for &mut T {} impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !StackOnly + impl !StackOnly for crate::utils::shared::slice::ThreadBlockSharedSlice { } diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 238b1aac8..e1f95ba95 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -2,14 +2,11 @@ use core::marker::PhantomData; use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -use crate::common::{CudaAsRust, DeviceAccessible, 
RustToCuda}; #[cfg(not(target_os = "cuda"))] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSlice { len: usize, marker: PhantomData, } @@ -17,29 +14,11 @@ pub struct ThreadBlockSharedSlice { #[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSlice { shared: *mut [T], } -#[doc(hidden)] -#[derive(TypeLayout)] -#[layout(bound = "T: 'static + ~const TypeGraphLayout")] -#[repr(C)] -pub struct ThreadBlockSharedSliceCudaRepresentation { - len: usize, - // Note: uses a zero-element array instead of PhantomData here so that - // TypeLayout can still observe T's layout - marker: [T; 0], -} - -unsafe impl DeviceCopy - for ThreadBlockSharedSliceCudaRepresentation -{ -} - -// #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -// #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockSharedSlice { +impl ThreadBlockSharedSlice { #[cfg(any(not(target_os = "cuda"), doc))] #[doc(cfg(not(target_os = "cuda")))] #[must_use] @@ -50,6 +29,22 @@ impl ThreadBlockSharedSlice { } } + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] + #[must_use] + pub fn with_len(mut self, len: usize) -> Self { + self.len = len; + self + } + + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] + #[must_use] + pub fn with_len_mut(&mut self, len: usize) -> &mut Self { + self.len = len; + self + } + #[cfg(not(target_os = "cuda"))] #[must_use] pub fn len(&self) -> usize { @@ -70,82 +65,28 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_slice_ptr(&self) -> *mut [T] { - self.shared + pub fn as_mut_ptr(&self) -> *mut T { + self.shared.cast() } #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn 
as_mut_ptr(&self) -> *mut T { - self.shared.cast() + pub fn as_mut_slice_ptr(&self) -> *mut [T] { + self.shared } -} -unsafe impl RustToCuda for ThreadBlockSharedSlice { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = ThreadBlockSharedSliceCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn borrow( + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + /// Safety: + /// + /// The provided `index` must not be out of bounds. + #[inline] + #[must_use] + pub unsafe fn index_mut_unchecked>( &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - Ok(( - DeviceAccessible::from(ThreadBlockSharedSliceCudaRepresentation { - len: self.len, - marker: [], - }), - crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); - - Ok(alloc) - } -} - -unsafe impl CudaAsRust - for ThreadBlockSharedSliceCudaRepresentation -{ - type RustRepresentation = ThreadBlockSharedSlice; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { - todo!() - - // unsafe { - // core::arch::asm!( - // ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", - // align = const(core::mem::align_of::()), - // ); - // } - - // let base: *mut u8; - - // unsafe { - // core::arch::asm!( - // "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", - // reg = out(reg64) base, - // ); - // } - - // let slice = core::ptr::slice_from_raw_parts_mut( - // base.add(self.byte_offset).cast(), self.len, - // ); + index: I, + ) -> *mut >::Output { + 
self.shared.get_unchecked_mut(index) } } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 58973ace4..368dc8296 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -69,25 +69,20 @@ impl ThreadBlockShared { impl ThreadBlockShared<[T; N]> { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] + /// Safety: + /// + /// The provided `index` must not be out of bounds. #[inline] #[must_use] - pub fn index(&self, index: usize) -> *const T { - self.index_mut(index) - } - - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] - #[inline] - #[must_use] - pub fn index_mut(&self, index: usize) -> *mut T { - assert!(index < N); - - // Safety: Since *[T; N] is valid, *T is valid iff index < N - unsafe { self.shared.cast::().add(index) } + pub unsafe fn index_mut_unchecked>( + &self, + index: I, + ) -> *mut >::Output { + core::ptr::slice_from_raw_parts_mut(self.shared.cast::(), N).get_unchecked_mut(index) } } -unsafe impl RustToCuda for ThreadBlockShared { +unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] type CudaAllocation = crate::host::NullCudaAlloc; @@ -120,7 +115,7 @@ unsafe impl RustToCuda for ThreadBlockShare } } -unsafe impl CudaAsRust +unsafe impl CudaAsRust for ThreadBlockSharedCudaRepresentation { type RustRepresentation = ThreadBlockShared; From 8864dbf91ba879d4cf53670e95cc7f75ee23bbe5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 17:30:51 +0000 Subject: [PATCH 025/120] Remove broken ThreadBlockShared RustToCuda impl --- examples/single-source/src/main.rs | 9 +++-- src/safety/device_copy.rs | 11 ++++-- src/safety/no_aliasing.rs | 7 +++- src/safety/stack_only.rs | 1 + src/safety/unified_heap.rs | 7 ++++ src/utils/shared/static.rs | 63 ------------------------------ 6 files changed, 27 insertions(+), 71 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 
e1030d261..55a2e8046 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ extern crate alloc; +#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -46,7 +47,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { @@ -61,9 +62,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( (*shared2.index_mut_unchecked(2)).1 = 24; } // unsafe { core::arch::asm!("hi") } - unsafe { - *shared3.as_mut_ptr() = 12; - } + // unsafe { + // *shared3.as_mut_ptr() = 12; + // } } #[cfg(not(target_os = "cuda"))] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index 0631f54d1..ee1cef0dc 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -7,6 +7,14 @@ mod sealed { #[marker] pub trait SafeDeviceCopy {} + // Thread-block-shared data cannot be copied since information is added inside + // CUDA + impl !SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} + impl !SafeDeviceCopy + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } + impl SafeDeviceCopy for T {} #[cfg(any(feature = "alloc", doc))] impl SafeDeviceCopy for T {} @@ -19,7 +27,4 @@ mod sealed { for crate::utils::device_copy::SafeDeviceCopyWrapper { } - - // No data is actually copied to the device - impl SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 0b246a52f..98a180b6a 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -23,6 +23,11 @@ mod private { } impl NoAliasing for 
crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} - // Only unsafe aliasing is possible since both only expose raw pointers + // Thread-block-shared data only allows unsafe aliasing since only raw pointers + // are exposed impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} + impl NoAliasing + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 47812cbcc..5dc5c0cbb 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -36,6 +36,7 @@ mod sealed { impl !StackOnly for &T {} impl !StackOnly for &mut T {} + // Thread-block-shared data contains data not on the stack impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} impl !StackOnly for crate::utils::shared::slice::ThreadBlockSharedSlice diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs index 9eda2d550..483b40c3a 100644 --- a/src/safety/unified_heap.rs +++ b/src/safety/unified_heap.rs @@ -38,6 +38,13 @@ mod sealed { impl !UnifiedHeapOnly for &T {} impl !UnifiedHeapOnly for &mut T {} + // Thread-block-shared data contains CUDA-only data + impl !UnifiedHeapOnly for crate::utils::shared::r#static::ThreadBlockShared {} + impl !UnifiedHeapOnly + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } + impl UnifiedHeapOnly for core::marker::PhantomData {} impl UnifiedHeapOnly for alloc::boxed::Box {} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 368dc8296..324c0fdef 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,36 +1,18 @@ #[cfg(not(target_os = "cuda"))] use core::marker::PhantomData; -use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; - #[cfg(not(target_os = "cuda"))] -#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { marker: PhantomData, } #[cfg(target_os = "cuda")] 
-#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { shared: *mut T, } -#[doc(hidden)] -#[derive(TypeLayout)] -#[repr(transparent)] -pub struct ThreadBlockSharedCudaRepresentation { - // Note: uses a zero-element array instead of PhantomData here so that - // TypeLayout can still observe T's layout - marker: [T; 0], -} - -unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} - impl ThreadBlockShared { #[cfg(not(target_os = "cuda"))] #[must_use] @@ -81,48 +63,3 @@ impl ThreadBlockShared<[T; N]> { core::ptr::slice_from_raw_parts_mut(self.shared.cast::(), N).get_unchecked_mut(index) } } - -unsafe impl RustToCuda for ThreadBlockShared { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = ThreadBlockSharedCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - Ok(( - DeviceAccessible::from(ThreadBlockSharedCudaRepresentation { marker: [] }), - crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); - - Ok(alloc) - } -} - -unsafe impl CudaAsRust - for ThreadBlockSharedCudaRepresentation -{ - type RustRepresentation = ThreadBlockShared; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { - ThreadBlockShared::new_uninit() - } -} From 9645e3c43adaf0966cba12e0c4d933983d26365c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 09:47:16 +0000 Subject: [PATCH 026/120] Refactor kernel trait generation to push 
more safety constraints to the kernel definition --- examples/single-source/expanded.rs | 1150 +++++++++++++++++ examples/single-source/src/main.rs | 8 +- rust-cuda-derive/src/kernel/link/mod.rs | 2 +- rust-cuda-derive/src/kernel/wrapper/config.rs | 5 + .../generate/cpu_linker_macro/get_ptx_str.rs | 88 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 33 +- .../kernel/wrapper/generate/cpu_wrapper.rs | 193 --- .../kernel_func.rs | 49 +- .../kernel_func_async/async_func_types.rs | 13 +- .../kernel_func_async/launch_types.rs | 42 +- .../kernel_func_async/mod.rs | 70 +- .../kernel_func_async/type_wrap.rs | 0 .../wrapper/generate/cpu_wrapper/mod.rs | 96 ++ rust-cuda-derive/src/kernel/wrapper/mod.rs | 49 +- .../src/rust_to_cuda/field_copy.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 25 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/common.rs | 60 +- src/device/mod.rs | 4 + src/host.rs | 46 +- src/utils/aliasing/const.rs | 18 +- src/utils/aliasing/dynamic.rs | 18 +- src/utils/aliasing/final.rs | 18 +- src/utils/box.rs | 12 +- src/utils/boxed_slice.rs | 12 +- src/utils/device_copy.rs | 32 +- src/utils/exchange/buffer/device.rs | 3 +- src/utils/exchange/buffer/host.rs | 6 +- src/utils/exchange/wrapper.rs | 10 +- src/utils/option.rs | 7 +- src/utils/shared/slice.rs | 2 +- src/utils/shared/static.rs | 2 +- 32 files changed, 1611 insertions(+), 468 deletions(-) create mode 100644 examples/single-source/expanded.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func.rs (84%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/async_func_types.rs (89%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/launch_types.rs (63%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/mod.rs 
(70%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/type_wrap.rs (100%) create mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs diff --git a/examples/single-source/expanded.rs b/examples/single-source/expanded.rs new file mode 100644 index 000000000..f16379c37 --- /dev/null +++ b/examples/single-source/expanded.rs @@ -0,0 +1,1150 @@ +#![feature(prelude_import)] +#![deny(clippy::pedantic)] +#![feature(cfg_version)] +#![feature(const_type_name)] +#![feature(const_refs_to_cell)] +#![feature(const_trait_impl)] +#![feature(const_mut_refs)] +#[prelude_import] +use std::prelude::rust_2021::*; +#[macro_use] +extern crate std; +extern crate alloc; +#[cfg(not(target_os = "cuda"))] +fn main() {} +#[repr(C)] +#[layout(crate = "rc::const_type_layout")] +pub struct Dummy(i32); +unsafe impl const rc::const_type_layout::TypeLayout for Dummy { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Dummy { 0: _, .. 
}: Dummy; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Dummy = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) + = (::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(Dummy(f_0.assume_init())), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for Dummy { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::populate_graph(graph); + } + } +} +#[cuda(crate = "rc")] +#[allow(dead_code)] +pub struct Wrapper { + #[cuda(embed)] + inner: T, +} +#[allow(dead_code)] +#[doc(hidden)] +#[allow(dead_code)] +#[repr(C)] +#[layout(free = "T")] +#[layout(crate = "rc :: const_type_layout")] +pub struct WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + inner: rc::common::DeviceAccessible< + ::CudaRepresentation, + >, +} +unsafe impl const rc::const_type_layout::TypeLayout for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + 
structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "inner", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let WrapperCudaRepresentation { + inner: _, + .. + }: WrapperCudaRepresentation; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + as ::const_type_layout::TypeLayout>::uninit() + } { + let base_ptr: *const WrapperCudaRepresentation = (&raw const uninit) + .cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).inner }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::< + rc::common::DeviceAccessible< + ::CudaRepresentation, + >, + >(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(inner)) + = (::CudaRepresentation, + > as rc::const_type_layout::TypeLayout>::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(WrapperCudaRepresentation { + inner: inner.assume_init(), + }), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::CudaRepresentation, + > as rc::const_type_layout::TypeGraph>::populate_graph(graph); + } + } +} +unsafe impl rc::rustacuda_core::DeviceCopy for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{} +unsafe impl 
rc::common::RustToCuda for Wrapper +where + T: rc::common::RustToCuda, +{ + type CudaRepresentation = WrapperCudaRepresentation; + type CudaAllocation = rc::common::CombinedCudaAlloc< + ::CudaAllocation, + rc::common::NullCudaAlloc, + >; + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow( + &self, + alloc: CudaAllocType, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let (field_inner_repr, alloc_front) = rc::common::RustToCuda::borrow( + &self.inner, + alloc_front, + )?; + let borrow = WrapperCudaRepresentation { + inner: field_inner_repr, + }; + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + let alloc_front = rc::common::RustToCuda::restore(&mut self.inner, alloc_front)?; + Ok(alloc_tail) + } +} +unsafe impl rc::common::RustToCudaAsync for Wrapper +where + T: rc::common::RustToCudaAsync, +{ + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let (field_inner_repr, alloc_front) = rc::common::RustToCudaAsync::borrow_async( + &self.inner, + alloc_front, + stream, + )?; + let borrow = WrapperCudaRepresentation { + inner: field_inner_repr, + }; + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: 
rc::common::CombinedCudaAlloc, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + let alloc_front = rc::common::RustToCudaAsync::restore_async( + &mut self.inner, + alloc_front, + stream, + )?; + Ok(alloc_tail) + } +} +unsafe impl rc::common::CudaAsRust for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + type RustRepresentation = Wrapper; +} +#[cuda(crate = "rc")] +pub struct Empty([u8; 0]); +#[allow(dead_code)] +#[doc(hidden)] +#[repr(C)] +#[layout(crate = "rc :: const_type_layout")] +pub struct EmptyCudaRepresentation( + rc::common::DeviceAccessible>, +); +unsafe impl const rc::const_type_layout::TypeLayout for EmptyCudaRepresentation { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let EmptyCudaRepresentation { + 0: _, + .. 
+ }: EmptyCudaRepresentation; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const EmptyCudaRepresentation = (&raw const uninit) + .cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::< + rc::common::DeviceAccessible< + rc::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>, + >, + >(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) + = (, + > as rc::const_type_layout::TypeLayout>::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(EmptyCudaRepresentation(f_0.assume_init())), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for EmptyCudaRepresentation { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + , + > as rc::const_type_layout::TypeGraph>::populate_graph(graph); + } + } +} +unsafe impl rc::rustacuda_core::DeviceCopy for EmptyCudaRepresentation {} +unsafe impl rc::common::RustToCuda for Empty { + type CudaRepresentation = EmptyCudaRepresentation; + type CudaAllocation = rc::common::NullCudaAlloc; + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow( + &self, + alloc: CudaAllocType, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = 
rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let field_0_repr = rc::common::DeviceAccessible::from(&self.0); + let borrow = EmptyCudaRepresentation(field_0_repr); + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} +unsafe impl rc::common::RustToCudaAsync for Empty { + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let field_0_repr = rc::common::DeviceAccessible::from(&self.0); + let borrow = EmptyCudaRepresentation(field_0_repr); + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} +unsafe impl rc::common::CudaAsRust for EmptyCudaRepresentation { + type RustRepresentation = Empty; +} +#[repr(C)] +#[layout(crate = "rc::const_type_layout")] +pub struct Tuple(u32, i32); +unsafe impl const rc::const_type_layout::TypeLayout for Tuple { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + 
rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Tuple { 0: _, .. }: Tuple; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Tuple = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + rc::const_type_layout::Field { + name: "1", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Tuple { 1: _, .. }: Tuple; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Tuple = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).1 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let ( + rc::const_type_layout::MaybeUninhabited::Inhabited(f_0), + rc::const_type_layout::MaybeUninhabited::Inhabited(f_1), + ) + = ( + ::uninit(), + ::uninit(), + ) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new( + Tuple(f_0.assume_init(), f_1.assume_init()), + ), + ) + } else { + 
rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for Tuple { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::populate_graph(graph); + ::populate_graph(graph); + } + } +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait KernelArgs +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + type __T_0; + type __T_1; + type __T_2; + type __T_3; + type __T_4; + type __T_5; +} +unsafe impl KernelArgs for () +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + type __T_0 = Dummy; + type __T_1 = Wrapper; + type __T_2 = Wrapper; + type __T_3 = core::sync::atomic::AtomicU64; + type __T_4 = Wrapper; + type __T_5 = Tuple; +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait KernelPtx +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + fn get_ptx_str() -> &'static str + where + Self: Sized + rc::host::Launcher>; + fn new_kernel() -> rc::rustacuda::error::CudaResult< + rc::host::TypedKernel>, + > + where + Self: Sized + rc::host::Launcher>; +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait Kernel: KernelPtx +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + #[allow(clippy::needless_lifetimes)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + fn kernel<'stream, '__r2c_lt_0, '__r2c_lt_1, '__r2c_lt_2, '__r2c_move_lt_4, 'a>( + &mut self, + stream: &'stream rc::rustacuda::stream::Stream, + _x: &'__r2c_lt_0 <() as KernelArgs>::__T_0, + _y: &'__r2c_lt_1 mut 
<() as KernelArgs>::__T_1, + _z: &'__r2c_lt_2 <() as KernelArgs>::__T_2, + _v: &'a <() as KernelArgs>::__T_3, + kernel_arg_4: <() as KernelArgs>::__T_4, + s_t: <() as KernelArgs>::__T_5, + ) -> rc::rustacuda::error::CudaResult<()> + where + Self: Sized + rc::host::Launcher>, + { + const fn __check_is_sync(_x: &T) -> bool { + trait IsSyncMarker { + const SYNC: bool = false; + } + impl IsSyncMarker for T {} + struct CheckIs(::core::marker::PhantomData); + #[allow(dead_code)] + impl CheckIs { + const SYNC: bool = true; + } + >::SYNC + } + let mut ___x_box = rc::host::HostDeviceBox::from( + rc::rustacuda::memory::DeviceBox::new( + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), + )?, + ); + #[allow(clippy::redundant_closure_call)] + let __result = (|_x| { + rc::host::LendToCuda::lend_to_cuda_mut( + _y, + |mut _y| { + (|_y| { + rc::host::LendToCuda::lend_to_cuda( + _z, + |_z| { + (|_z| { + let mut ___v_box = rc::host::HostDeviceBox::from( + rc::rustacuda::memory::DeviceBox::new( + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), + )?, + ); + #[allow(clippy::redundant_closure_call)] + let __result = (|_v| { + rc::host::LendToCuda::move_to_cuda( + kernel_arg_4, + |mut kernel_arg_4| { + (|kernel_arg_4| { + { + let s_t = rc::utils::device_copy::SafeDeviceCopyWrapper::from( + s_t, + ); + self.kernel_async( + stream, + _x, + _y, + _z, + _v, + kernel_arg_4, + s_t, + )?; + stream.synchronize() + } + })(kernel_arg_4.as_async()) + }, + ) + })(unsafe { + rc::host::HostAndDeviceConstRef::new( + &___v_box, + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), + ) + .as_async() + }); + if !__check_is_sync(_v) { + ___v_box + .copy_to(unsafe { &mut *(_v as *const _ as *mut _) })?; + } + ::core::mem::drop(___v_box); + __result + })(_z.as_async()) + }, + ) + })(_y.as_async()) + }, + ) + })(unsafe { + rc::host::HostAndDeviceConstRef::new( + &___x_box, + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), + ) + .as_async() + }); + if 
!__check_is_sync(_x) { + ___x_box.copy_to(unsafe { &mut *(_x as *const _ as *mut _) })?; + } + ::core::mem::drop(___x_box); + __result + } + #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + fn kernel_async< + 'stream, + '__r2c_lt_0, + '__r2c_lt_1, + '__r2c_lt_2, + '__r2c_move_lt_4, + 'a, + >( + &mut self, + stream: &'stream rc::rustacuda::stream::Stream, + _x: rc::host::HostAndDeviceConstRefAsync< + 'stream, + '__r2c_lt_0, + rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_0>, + >, + mut _y: rc::host::HostAndDeviceMutRefAsync< + 'stream, + '__r2c_lt_1, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _z: rc::host::HostAndDeviceConstRefAsync< + 'stream, + '__r2c_lt_2, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _v: rc::host::HostAndDeviceConstRefAsync< + 'stream, + 'a, + rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_3>, + >, + kernel_arg_4: rc::host::HostAndDeviceOwnedAsync< + 'stream, + '__r2c_move_lt_4, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + s_t: rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_5>, + ) -> rc::rustacuda::error::CudaResult<()> + where + Self: Sized + rc::host::Launcher>, + { + let rc::host::LaunchPackage { kernel, watcher, config } = rc::host::Launcher::get_launch_package( + self, + ); + let kernel_jit_result = if config.ptx_jit { + kernel + .compile_with_ptx_jit_args( + Some( + &[ + None, + Some(rc::ptx_jit::arg_as_raw_bytes(_y.for_host())), + None, + Some(rc::ptx_jit::arg_as_raw_bytes(_v.for_host())), + None, + None, + ], + ), + )? + } else { + kernel.compile_with_ptx_jit_args(None)? 
+ }; + let function = match kernel_jit_result { + rc::host::KernelJITResult::Recompiled(function) => { + ::on_compile(function, watcher)?; + function + } + rc::host::KernelJITResult::Cached(function) => function, + }; + #[allow(clippy::redundant_closure_call)] + (| + _x: rc::common::DeviceConstRef< + '__r2c_lt_0, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_0, + >, + >, + _y: rc::common::DeviceMutRef< + '__r2c_lt_1, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _z: rc::common::DeviceConstRef< + '__r2c_lt_2, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _v: rc::common::DeviceConstRef< + 'a, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_3, + >, + >, + kernel_arg_4: rc::common::DeviceMutRef< + '__r2c_move_lt_4, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + s_t: rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_5, + >| + { + if false { + #[allow(dead_code)] + fn assert_impl_devicecopy(_val: &T) {} + #[allow(dead_code)] + fn assert_impl_no_aliasing() {} + #[allow(dead_code)] + fn assert_impl_fits_into_device_register< + T: rc::safety::FitsIntoDeviceRegister, + >(_val: &T) {} + assert_impl_devicecopy(&_x); + assert_impl_devicecopy(&_y); + assert_impl_devicecopy(&_z); + assert_impl_devicecopy(&_v); + assert_impl_devicecopy(&kernel_arg_4); + assert_impl_devicecopy(&s_t); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_0>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_1>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_2>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_3>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_4>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_5>(); + 
assert_impl_fits_into_device_register(&_x); + assert_impl_fits_into_device_register(&_y); + assert_impl_fits_into_device_register(&_z); + assert_impl_fits_into_device_register(&_v); + assert_impl_fits_into_device_register(&kernel_arg_4); + assert_impl_fits_into_device_register(&s_t); + } + let rc::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _ } = config; + unsafe { + stream + .launch( + function, + grid, + block, + shared_memory_size, + &[ + &_x as *const _ as *mut ::std::ffi::c_void, + &_y as *const _ as *mut ::std::ffi::c_void, + &_z as *const _ as *mut ::std::ffi::c_void, + &_v as *const _ as *mut ::std::ffi::c_void, + &kernel_arg_4 as *const _ as *mut ::std::ffi::c_void, + &s_t as *const _ as *mut ::std::ffi::c_void, + ], + ) + } + })( + unsafe { _x.for_device_async() }, + unsafe { _y.for_device_async() }, + unsafe { _z.for_device_async() }, + unsafe { _v.for_device_async() }, + unsafe { kernel_arg_4.for_device_async() }, + s_t, + ) + } +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe impl> Kernel for K +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{} +#[cfg(not(target_os = "cuda"))] +const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, +> = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_type_layout\n\n.visible .entry kernel_type_layout()\n{\n\n\n\tret;\n\n}\n\t// .globl\tkernel_dfae7eaf723a670c\n.visible .entry kernel_dfae7eaf723a670c()\n{\n\n\n\tret;\n\n}\n" + .as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c".as_bytes(), + ) + }, +>; +#[cfg(not(target_os = "cuda"))] +mod host { + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; + #[allow(dead_code)] + struct 
Launcher(core::marker::PhantomData); + unsafe impl KernelPtx for Launcher { + fn get_ptx_str() -> &'static str { + const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_aab1c403129e575b\n.visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_4,\n\t.param .align 4 .b8 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// \n"; + const 
__KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut 
rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1809usize] = b"\x91\x0e\x050.1.0\x86\x01rust_cuda::common::DeviceConstRef>>\x0b\x86\x01rust_cuda::common::DeviceConstRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00j*const rust_cuda::common::DeviceAccessible>\treferenceh\x00\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>j*const rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>icrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const 
rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = 
b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; + const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, + > = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + PTX_STR.as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b" + .as_bytes(), + ) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_0, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + 
rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_3, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_5, + >, + >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) + }, + >; + PTX_STR + } + fn new_kernel() -> rc::rustacuda::error::CudaResult< + rc::host::TypedKernel>, + > { + let ptx = Self::get_ptx_str(); + let entry_point = "kernel_dfae7eaf723a670c_kernel_aab1c403129e575b"; + rc::host::TypedKernel::new(ptx, entry_point) + } + } + unsafe impl KernelPtx> + for Launcher> { + fn get_ptx_str() -> &'static str { + const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_54d0891c50855d77\n.visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_4,\n\t.param .align 4 .b8 
kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// >\n"; + const __KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + 
const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8f\x01*const rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x8f\x01*const 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>i\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; + const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, + > = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + PTX_STR.as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77" + .as_bytes(), + ) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_0, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 
'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_3, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_5, + >, + >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) + }, + >; + PTX_STR + } + fn new_kernel() -> 
rc::rustacuda::error::CudaResult< + rc::host::TypedKernel< + dyn Kernel>, + >, + > { + let ptx = Self::get_ptx_str(); + let entry_point = "kernel_dfae7eaf723a670c_kernel_54d0891c50855d77"; + rc::host::TypedKernel::new(ptx, entry_point) + } + } + impl rc::host::Launcher for Launcher { + type CompilationWatcher = (); + type KernelTraitObject = dyn Kernel; + fn get_launch_package(&mut self) -> rc::host::LaunchPackage { + ::core::panicking::panic("not implemented") + } + } +} diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 55a2e8046..981c9bccc 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -38,7 +38,7 @@ pub struct Empty([u8; 0]); #[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); -#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] #[kernel(crate = "rc")] pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, @@ -49,7 +49,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where + T: rc::safety::StackOnly + rc::safety::NoAliasing, ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -69,7 +71,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[cfg(not(target_os = "cuda"))] mod host { - use super::{Kernel, KernelArgs}; + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; #[allow(dead_code)] struct Launcher(core::marker::PhantomData); diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 1b116435c..d383198ec 100644 --- 
a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -247,7 +247,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let r = unsafe { ptx_compiler::sys::nvPTXCompilerCompile( compiler, - options_ptrs.len() as c_int, + c_int::try_from(options_ptrs.len()).unwrap(), options_ptrs.as_ptr().cast(), ) }; diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index c07486c2b..382db35f9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -3,6 +3,7 @@ pub(super) struct KernelConfig { pub(super) linker: syn::Ident, pub(super) kernel: syn::Ident, pub(super) args: syn::Ident, + pub(super) ptx: syn::Ident, pub(super) launcher: syn::Ident, } @@ -17,6 +18,9 @@ impl syn::parse::Parse for KernelConfig { let kernel: syn::Ident = input.parse()?; let _lt_token: syn::token::Lt = input.parse()?; let args: syn::Ident = input.parse()?; + let _comma: syn::token::Comma = input.parse()?; + let ptx: syn::Ident = input.parse()?; + let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; let launcher: syn::Ident = input.parse()?; @@ -26,6 +30,7 @@ impl syn::parse::Parse for KernelConfig { linker, kernel, args, + ptx, launcher, }) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d412bd316..b3e215a20 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -3,7 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; 
pub(super) fn quote_get_ptx_str( crate_path: &syn::Path, @@ -30,14 +30,8 @@ pub(super) fn quote_get_ptx_str( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let cpu_func_lifetime_erased_types = super::kernel_func_async::generate_launch_types( - crate_path, - config, - generics, - inputs, - macro_type_ids, - ) - .1; + let cpu_func_lifetime_erased_types = + generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); let matching_kernel_assert = if skip_kernel_compilation() { quote!() @@ -93,7 +87,83 @@ pub(super) fn quote_get_ptx_str( #(#type_layout_asserts)* + #[deny(improper_ctypes)] + mod __rust_cuda_ffi_safe_assert { + use super::#args; + + extern "C" { #( + #[allow(dead_code)] + static #func_params: #cpu_func_lifetime_erased_types; + )* } + } + PTX_STR } } } + +fn generate_lifetime_erased_types( + crate_path: &syn::Path, + KernelConfig { args, .. }: &KernelConfig, + DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, + macro_type_ids: &[syn::Ident], +) -> Vec { + let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); + + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .enumerate() + .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + syn::FnArg::Typed(syn::PatType { ty, .. }) => { + let type_ident = quote::format_ident!("__T_{}", i); + let syn_type = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + + let cuda_type = match cuda_mode { + InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + }, + InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation + > + }, + }; + + cpu_func_lifetime_erased_types.push( + if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { + if mutability.is_some() { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<'static, #cuda_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceConstRef<'static, #cuda_type> + } + } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<'static, #cuda_type> + } + } else { + cuda_type + }, + ); + }, + syn::FnArg::Receiver(_) => unreachable!(), + }); + + cpu_func_lifetime_erased_types +} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index aedf1e12e..91f94a568 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -3,22 +3,18 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; mod get_ptx_str; -mod kernel_func; -mod kernel_func_async; mod new_kernel; use get_ptx_str::quote_get_ptx_str; -use kernel_func::quote_kernel_func; -use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { visibility, - kernel, linker, launcher, + ptx, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { @@ -30,7 +26,6 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs: &FunctionInputs, func_ident: &FuncIdent, func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], ) -> TokenStream { let macro_types = generic_params .iter() @@ -72,42 +67,18 @@ pub(in super::super) fn quote_cpu_linker_macro( func_ident, ¯o_type_ids, ); - let kernel_func = quote_kernel_func( - crate_path, - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - let kernel_func_async = quote_kernel_func_async( - crate_path, - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); quote! { #[cfg(not(target_os = "cuda"))] #cpu_linker_macro_visibility macro_rules! #linker { (#(#macro_types),* $(,)?) => { - unsafe impl #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token + unsafe impl #ptx #generic_start_token #($#macro_type_ids),* #generic_close_token for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token { #get_ptx_str #new_kernel - - #kernel_func - - #kernel_func_async } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs deleted file mode 100644 index 4851af9ce..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ /dev/null @@ -1,193 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; - -pub(in super::super) fn quote_cpu_wrapper( - crate_path: &syn::Path, - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - generic_wrapper_params, - generic_wrapper_where_clause, - .. 
- }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; - - let (new_func_inputs_decl, new_func_inputs_async_decl) = - generate_new_func_inputs_decl(crate_path, config, impl_generics, func_inputs); - - quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; - - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > where #launcher_predicate; - - #(#func_attrs)* - #[allow(clippy::too_many_arguments)] - fn #func_ident <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_decl),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - - #(#func_attrs)* - #[allow(clippy::too_many_arguments)] - fn #func_ident_async <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_async_decl),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - } - } -} - -fn generate_new_func_inputs_decl( - crate_path: 
&syn::Path, - KernelConfig { args, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: syn_type, - })) - } else { - syn_type - } - }, - }), - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: Box = - syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => syn::parse_quote!( - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - ), - InputCudaType::LendRustToCuda => syn::parse_quote!( - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - ), - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let wrapped_type = if mutability.is_some() { - syn::parse_quote!( - #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> - ) - } else { - syn::parse_quote!( - #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> - ) - }; - - Box::new(wrapped_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = syn::parse_quote!( - #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> - ); - - Box::new(wrapped_type) - } else { - cuda_type - } - }, - }) - ), - syn::FnArg::Receiver(_) => unreachable!(), - }).unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs similarity index 84% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index d6e70e276..94b4b9598 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,15 +1,16 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::{ + DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func( +pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { kernel, args, .. }: &KernelConfig, + ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { - generic_start_token, generic_wrapper_params, - generic_close_token, generic_wrapper_where_clause, .. 
}: &DeclGenerics, @@ -17,9 +18,34 @@ pub(super) fn quote_kernel_func( fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs = func_inputs + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let generic_wrapper_where_clause = match generic_wrapper_where_clause { + Some(syn::WhereClause { + where_token, + predicates, + }) if !predicates.is_empty() => { + let comma = if predicates.empty_or_trailing() { + quote!() + } else { + quote!(,) + }; + + quote! { + #where_token #predicates #comma #launcher_predicate + } + }, + _ => quote! { + where #launcher_predicate + }, + }; + + let kernel_func_inputs = func_inputs .iter() .enumerate() .map(|(i, arg)| match arg { @@ -31,9 +57,7 @@ pub(super) fn quote_kernel_func( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; if let syn::Type::Reference(syn::TypeReference { @@ -60,10 +84,13 @@ pub(super) fn quote_kernel_func( quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs),* + #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs similarity index 89% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index c24406c9a..efe8026eb 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -3,21 +3,16 @@ use syn::spanned::Spanned; use crate::kernel::utils::r2c_move_lifetime; -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - macro_type_ids: &[syn::Ident], ) -> Vec { func_inputs .iter() @@ -32,9 +27,7 @@ pub(super) fn generate_async_func_types( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! 
{ - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; let cuda_type = match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs similarity index 63% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 16cd0008e..454bdcd57 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -3,24 +3,18 @@ use syn::spanned::Spanned; use crate::kernel::utils::r2c_move_lifetime; -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> (Vec, Vec, Vec) { +) -> (Vec, Vec) { let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); func_inputs @@ -31,9 +25,7 @@ pub(in super::super) fn generate_launch_types( syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! 
{ ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; cpu_func_unboxed_types.push(syn_type.clone()); @@ -75,33 +67,9 @@ pub(in super::super) fn generate_launch_types( quote! { #cuda_type } }, ); - - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); }, syn::FnArg::Receiver(_) => unreachable!(), }); - ( - cpu_func_types_launch, - cpu_func_lifetime_erased_types, - cpu_func_unboxed_types, - ) + (cpu_func_types_launch, cpu_func_unboxed_types) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs similarity index 70% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index c01dcdce3..462855156 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -1,20 +1,21 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod async_func_types; mod launch_types; mod type_wrap; use async_func_types::generate_async_func_types; 
-pub(super) use launch_types::generate_launch_types; +use launch_types::generate_launch_types; use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config @ KernelConfig { args, .. }: &KernelConfig, - decl_generics @ DeclGenerics { + config @ KernelConfig { kernel, .. }: &KernelConfig, + impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + DeclGenerics { generic_wrapper_params, generic_wrapper_where_clause, .. @@ -25,33 +26,50 @@ pub(super) fn quote_kernel_func_async( }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_async = generate_async_func_types( - crate_path, - config, - decl_generics, - func_inputs, - macro_type_ids, - ); + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let generic_wrapper_where_clause = match generic_wrapper_where_clause { + Some(syn::WhereClause { + where_token, + predicates, + }) if !predicates.is_empty() => { + let comma = if predicates.empty_or_trailing() { + quote!() + } else { + quote!(,) + }; + + quote! { + #where_token #predicates #comma #launcher_predicate + } + }, + _ => quote! { + where #launcher_predicate + }, + }; + + let kernel_func_async_inputs = + generate_async_func_types(crate_path, config, impl_generics, func_inputs); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); - let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types( - crate_path, - config, - decl_generics, - func_inputs, - macro_type_ids, - ); + let (cpu_func_types_launch, cpu_func_unboxed_types) = + generate_launch_types(crate_path, config, impl_generics, func_inputs); quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_async),* + #(#kernel_func_async_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -77,16 +95,6 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::redundant_closure_call)] (|#(#func_params: #cpu_func_types_launch),*| { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - if false { #[allow(dead_code)] fn assert_impl_devicecopy(_val: &T) {} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs new file mode 100644 index 000000000..1b984f920 --- /dev/null +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -0,0 +1,96 @@ +use proc_macro2::TokenStream; + +use super::super::{ + BlanketGenerics, DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig, +}; + +mod kernel_func; +mod kernel_func_async; + +use kernel_func::quote_kernel_func_inputs; +use kernel_func_async::quote_kernel_func_async; + +#[allow(clippy::too_many_arguments)] +pub(in super::super) fn quote_cpu_wrapper( + crate_path: &syn::Path, + config @ KernelConfig { + 
visibility, + kernel, + ptx, + .. + }: &KernelConfig, + decl @ DeclGenerics { + generic_start_token, + generic_trait_params, + generic_close_token, + generic_trait_where_clause, + .. + }: &DeclGenerics, + impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + BlanketGenerics { + blanket_ty, + impl_generics: blanket_impl_generics, + where_clause: blanket_where_clause, + }: &BlanketGenerics, + func_inputs: &FunctionInputs, + fn_ident: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let kernel_func = quote_kernel_func_inputs( + crate_path, + config, + impl_generics, + decl, + func_inputs, + fn_ident, + func_params, + func_attrs, + ); + let kernel_func_async = quote_kernel_func_async( + crate_path, + config, + impl_generics, + decl, + func_inputs, + fn_ident, + func_params, + func_attrs, + ); + + quote! 
{ + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + #visibility unsafe trait #ptx #generic_start_token #generic_trait_params #generic_close_token + #generic_trait_where_clause + { + fn get_ptx_str() -> &'static str where #launcher_predicate; + + fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel + > where #launcher_predicate; + } + + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token: #ptx #ty_generics + #generic_trait_where_clause + { + #kernel_func + + #kernel_func_async + } + + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty + #blanket_where_clause + {} + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 76b88eee6..b720a8965 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -31,8 +31,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! 
as impl KERNEL for LAUNCHER)] expects \ + LINKER, KERNEL, ARGS, PTX, and LAUNCHER identifiers: {:?}", err ) }, @@ -172,14 +172,34 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { gt_token: generic_close_token, where_clause: generic_trait_where_clause.clone(), }; - let impl_generics = { - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - } + let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); + let blanket_ty = syn::Ident::new("K", Span::mixed_site()); + let mut blanket_params = generic_trait_params.clone(); + let ptx = &config.ptx; + blanket_params.push(syn::GenericParam::Type(syn::TypeParam { + attrs: Vec::new(), + ident: blanket_ty.clone(), + colon_token: syn::parse_quote!(:), + bounds: syn::parse_quote!(#ptx #ty_generics), + eq_token: None, + default: None, + })); + let trait_blanket_generics = syn::Generics { + lt_token: Some(generic_start_token.unwrap_or(syn::parse_quote!(<))), + params: blanket_params, + gt_token: Some(generic_close_token.unwrap_or(syn::parse_quote!(>))), + where_clause: generic_trait_where_clause.clone(), + }; + let (blanket_impl_generics, _, blanket_where_clause) = trait_blanket_generics.split_for_impl(); + let blanket_generics = BlanketGenerics { + blanket_ty, + impl_generics: blanket_impl_generics, + where_clause: blanket_where_clause, + }; + let impl_generics = ImplGenerics { + impl_generics, + ty_generics, + where_clause, }; let func_ident = FuncIdent { @@ -237,8 +257,10 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &config, &decl_generics, &impl_generics, + &blanket_generics, &func_inputs, &func_ident, + &func_params, &func.attrs, ); let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); @@ -249,7 +271,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_inputs, &func_ident, &func_params, - &func.attrs, ); let 
cuda_wrapper = quote_cuda_wrapper( &crate_path, @@ -307,6 +328,12 @@ struct ImplGenerics<'f> { where_clause: Option<&'f syn::WhereClause>, } +struct BlanketGenerics<'f> { + blanket_ty: syn::Ident, + impl_generics: syn::ImplGenerics<'f>, + where_clause: Option<&'f syn::WhereClause>, +} + #[allow(clippy::struct_field_names)] struct FuncIdent<'f> { func_ident: &'f syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index c6659e9c9..549f5ab56 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -56,7 +56,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::host::CombinedCudaAlloc< + #crate_path::common::CombinedCudaAlloc< <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > @@ -102,7 +102,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! 
{ - #crate_path::host::CombinedCudaAlloc< + #crate_path::common::CombinedCudaAlloc< <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1ff844645..1682c0c80 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -81,18 +81,17 @@ pub fn rust_to_cuda_trait( { type CudaRepresentation = #struct_name_cuda #ty_generics; - #[cfg(not(target_os = "cuda"))] type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, ) -> #crate_path::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, - #crate_path::host::CombinedCudaAlloc + #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::host::NullCudaAlloc; + let alloc_front = #crate_path::common::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -101,14 +100,14 @@ pub fn rust_to_cuda_trait( Ok(( #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: #crate_path::host::CombinedCudaAlloc< + alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, ) -> #crate_path::rustacuda::error::CudaResult { @@ -154,15 +153,15 @@ pub fn rust_to_cuda_async_trait( #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, stream: &#crate_path::rustacuda::stream::Stream, ) -> #crate_path::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, - #crate_path::host::CombinedCudaAlloc + #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = 
#crate_path::host::NullCudaAlloc; + let alloc_front = #crate_path::common::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -171,14 +170,14 @@ pub fn rust_to_cuda_async_trait( Ok(( #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: #crate_path::host::CombinedCudaAlloc< + alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, stream: &#crate_path::rustacuda::stream::Stream, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 4173d6658..5e11ffc8c 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - #crate_path::host::NullCudaAlloc + #crate_path::common::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/common.rs b/src/common.rs index abb196c05..6a7e7d926 100644 --- a/src/common.rs +++ b/src/common.rs @@ -79,9 +79,7 @@ impl DerefMut for DeviceAccessible { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCuda { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation: crate::host::CudaAlloc; + type CudaAllocation: CudaAlloc; type CudaRepresentation: CudaAsRust + TypeGraphLayout; #[cfg(feature = "host")] @@ -97,12 +95,12 @@ pub unsafe trait RustToCuda { /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the /// CPU as it contains a GPU-resident copy of `self`. #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[cfg(feature = "host")] @@ -116,9 +114,9 @@ pub unsafe trait RustToCuda { /// /// This is an internal function and should NEVER be called manually #[allow(clippy::type_complexity)] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult; } @@ -142,13 +140,13 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// be accessed on the CPU as it contains a GPU-resident copy of /// `self`. 
#[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[cfg(feature = "host")] @@ -162,9 +160,9 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// This is an internal function and should NEVER be called manually #[allow(clippy::type_complexity)] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult; } @@ -240,3 +238,43 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { unsafe { &mut *self.pointer } } } + +pub(crate) mod crate_private { + pub mod alloc { + pub trait Sealed {} + } +} + +mod private { + pub mod empty { + pub trait Sealed {} + } +} + +pub trait EmptyCudaAlloc: private::empty::Sealed {} +impl EmptyCudaAlloc for T {} + +pub trait CudaAlloc: crate_private::alloc::Sealed {} +impl CudaAlloc for T {} + +impl crate_private::alloc::Sealed for Option {} + +pub struct NullCudaAlloc; +impl crate_private::alloc::Sealed for NullCudaAlloc {} +impl private::empty::Sealed for NullCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl crate_private::alloc::Sealed for CombinedCudaAlloc {} +impl private::empty::Sealed + for CombinedCudaAlloc +{ +} +impl CombinedCudaAlloc { + pub fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} diff --git a/src/device/mod.rs b/src/device/mod.rs index 45c833923..f7347aa98 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -116,3 +116,7 @@ impl DerefMut for ShallowCopy { &mut self.0 } } + +pub struct SomeCudaAlloc(()); + +impl crate::common::crate_private::alloc::Sealed for SomeCudaAlloc {} diff --git a/src/host.rs b/src/host.rs index 591ed4ed5..7a5eaf854 100644 --- a/src/host.rs +++ b/src/host.rs @@ 
-20,7 +20,9 @@ use rustacuda_core::{DeviceCopy, DevicePointer}; pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; use crate::{ - common::{DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, + common::{ + DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + }, ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; @@ -250,53 +252,17 @@ impl LendToCuda for T { } } -pub(crate) mod private { - pub mod alloc { - pub trait Sealed {} - } - +mod private { pub mod drop { pub trait Sealed: Sized { fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; } } - - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} - -pub trait CudaAlloc: private::alloc::Sealed {} -impl CudaAlloc for T {} - -impl private::alloc::Sealed for Option {} - -pub struct NullCudaAlloc; -impl private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl private::alloc::Sealed for CombinedCudaAlloc {} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } } #[repr(transparent)] pub struct CudaDropWrapper(ManuallyDrop); -impl private::alloc::Sealed for CudaDropWrapper {} +impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) @@ -416,7 +382,7 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); -impl private::alloc::Sealed for HostDeviceBox {} +impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 
ea5f1bba4..91496a47d 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -178,8 +178,6 @@ impl, const STRIDE: usize> BorrowMut<[E]> unsafe impl RustToCuda for SplitSliceOverCudaThreadsConstStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsConstStride, STRIDE>; @@ -187,12 +185,12 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -204,9 +202,9 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } @@ -218,13 +216,13 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -236,9 +234,9 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c2ad169ff..d7b48b05f 100644 --- a/src/utils/aliasing/dynamic.rs 
+++ b/src/utils/aliasing/dynamic.rs @@ -152,8 +152,6 @@ impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicSt } unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsDynamicStride>; @@ -161,12 +159,12 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -181,9 +179,9 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } @@ -193,13 +191,13 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; @@ -214,9 +212,9 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git 
a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 5a3d1695c..019ece1b6 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -12,20 +12,18 @@ pub struct FinalCudaRepresentation(DeviceAccessible); unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} unsafe impl RustToCuda for Final { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = FinalCudaRepresentation; #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow(alloc)?; @@ -37,9 +35,9 @@ unsafe impl RustToCuda for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T let inner: &mut T = &mut *(self as *mut Self).cast(); @@ -52,13 +50,13 @@ unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; @@ -70,9 +68,9 @@ unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { 
// Safety: Final is a repr(transparent) newtype wrapper around T diff --git a/src/utils/box.rs b/src/utils/box.rs index e3381f022..195536f0d 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -9,8 +9,11 @@ use crate::{ #[cfg(feature = "host")] use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + rustacuda::error::CudaResult, + rustacuda::memory::DeviceBox, + utils::device_copy::SafeDeviceCopyWrapper, }; #[doc(hidden)] @@ -29,8 +32,9 @@ unsafe impl rustacuda_core::DeviceCopy unsafe impl RustToCuda for Box { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(not(feature = "host"))] + type CudaAllocation = crate::device::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 5ed008801..d5c022ede 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -9,8 +9,11 @@ use crate::{ #[cfg(feature = "host")] use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + rustacuda::error::CudaResult, + rustacuda::memory::DeviceBuffer, + utils::device_copy::SafeDeviceCopyWrapper, }; #[doc(hidden)] @@ -29,8 +32,9 @@ unsafe impl rustacuda_core::DeviceCopy unsafe impl RustToCuda for Box<[T]> { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(not(feature = "host"))] + type CudaAllocation = crate::device::SomeCudaAlloc; type CudaRepresentation = 
BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 1ae0515f9..46a75824c 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,10 +3,13 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}, + common::{CudaAsRust, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; +#[cfg(feature = "host")] +use crate::common::{CombinedCudaAlloc, CudaAlloc}; + #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] pub struct SafeDeviceCopyWrapper(T) @@ -71,30 +74,29 @@ impl SafeDeviceCopyWrapper { } unsafe impl RustToCuda for SafeDeviceCopyWrapper { - #[cfg(feature = "host")] - type CudaAllocation = crate::host::NullCudaAlloc; + type CudaAllocation = NullCudaAlloc; type CudaRepresentation = Self; #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )> { - let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } @@ -105,26 +107,26 @@ unsafe impl RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - 
crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )> { - let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 1ecaf91d2..14ffac979 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -3,7 +3,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; use crate::{ - common::{RustToCuda, RustToCudaAsync}, + common::{NullCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -43,6 +43,7 @@ impl DerefMut unsafe impl RustToCuda for CudaExchangeBufferDevice { + type CudaAllocation = NullCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index debe33059..e45efc71e 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,8 +11,10 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, - host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc}, + common::{ + CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync, + }, + host::CudaDropWrapper, safety::SafeDeviceCopy, }; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index a4a8e50f7..711409469 100644 --- a/src/utils/exchange/wrapper.rs +++ 
b/src/utils/exchange/wrapper.rs @@ -15,11 +15,13 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, + common::{ + CombinedCudaAlloc, DeviceAccessible, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + RustToCudaAsync, + }, host::{ - CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, - HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, - HostLockedBox, NullCudaAlloc, + CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, + HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, }, }; diff --git a/src/utils/option.rs b/src/utils/option.rs index 18b86527b..f939f5ba0 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -12,7 +12,10 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::{host::CombinedCudaAlloc, host::CudaAlloc, rustacuda::error::CudaResult}; +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + rustacuda::error::CudaResult, +}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -28,8 +31,6 @@ pub struct OptionCudaRepresentation { unsafe impl rustacuda_core::DeviceCopy for OptionCudaRepresentation {} unsafe impl RustToCuda for Option { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = Option<::CudaAllocation>; type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index e1f95ba95..0a8a66c62 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -78,7 +78,7 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] - /// Safety: + /// # Safety /// /// The provided `index` must not be out of bounds. 
#[inline] diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 324c0fdef..5b8cdfc52 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -51,7 +51,7 @@ impl ThreadBlockShared { impl ThreadBlockShared<[T; N]> { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] - /// Safety: + /// # Safety /// /// The provided `index` must not be out of bounds. #[inline] From 0f4fc4639554007b700be0a5aa752a1a69ad541c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 10:04:28 +0000 Subject: [PATCH 027/120] Fixed SomeCudaAlloc import --- .gitignore | 3 + examples/single-source/expanded.rs | 1150 --------------------- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/common.rs | 10 +- src/device/mod.rs | 4 - src/host.rs | 10 +- src/utils/box.rs | 2 +- src/utils/boxed_slice.rs | 2 +- src/utils/device_copy.rs | 12 +- src/utils/exchange/buffer/device.rs | 4 +- src/utils/exchange/buffer/host.rs | 8 +- src/utils/exchange/wrapper.rs | 20 +- 13 files changed, 42 insertions(+), 1189 deletions(-) delete mode 100644 examples/single-source/expanded.rs diff --git a/.gitignore b/.gitignore index 767dae236..218ca8786 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk + +# cargo expand dev output files +**/expanded.rs diff --git a/examples/single-source/expanded.rs b/examples/single-source/expanded.rs deleted file mode 100644 index f16379c37..000000000 --- a/examples/single-source/expanded.rs +++ /dev/null @@ -1,1150 +0,0 @@ -#![feature(prelude_import)] -#![deny(clippy::pedantic)] -#![feature(cfg_version)] -#![feature(const_type_name)] -#![feature(const_refs_to_cell)] -#![feature(const_trait_impl)] -#![feature(const_mut_refs)] -#[prelude_import] -use std::prelude::rust_2021::*; -#[macro_use] -extern crate std; -extern crate alloc; -#[cfg(not(target_os = "cuda"))] -fn main() {} -#[repr(C)] 
-#[layout(crate = "rc::const_type_layout")] -pub struct Dummy(i32); -unsafe impl const rc::const_type_layout::TypeLayout for Dummy { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Dummy { 0: _, .. }: Dummy; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Dummy = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) - = (::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(Dummy(f_0.assume_init())), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for Dummy { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::populate_graph(graph); - } - } -} -#[cuda(crate = "rc")] -#[allow(dead_code)] -pub struct Wrapper { - #[cuda(embed)] - inner: T, -} -#[allow(dead_code)] -#[doc(hidden)] -#[allow(dead_code)] -#[repr(C)] 
-#[layout(free = "T")] -#[layout(crate = "rc :: const_type_layout")] -pub struct WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - inner: rc::common::DeviceAccessible< - ::CudaRepresentation, - >, -} -unsafe impl const rc::const_type_layout::TypeLayout for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "inner", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let WrapperCudaRepresentation { - inner: _, - .. - }: WrapperCudaRepresentation; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - as ::const_type_layout::TypeLayout>::uninit() - } { - let base_ptr: *const WrapperCudaRepresentation = (&raw const uninit) - .cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).inner }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::< - rc::common::DeviceAccessible< - ::CudaRepresentation, - >, - >(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(inner)) - = (::CudaRepresentation, - > as rc::const_type_layout::TypeLayout>::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(WrapperCudaRepresentation { - inner: 
inner.assume_init(), - }), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::CudaRepresentation, - > as rc::const_type_layout::TypeGraph>::populate_graph(graph); - } - } -} -unsafe impl rc::rustacuda_core::DeviceCopy for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{} -unsafe impl rc::common::RustToCuda for Wrapper -where - T: rc::common::RustToCuda, -{ - type CudaRepresentation = WrapperCudaRepresentation; - type CudaAllocation = rc::common::CombinedCudaAlloc< - ::CudaAllocation, - rc::common::NullCudaAlloc, - >; - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( - &self, - alloc: CudaAllocType, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let (field_inner_repr, alloc_front) = rc::common::RustToCuda::borrow( - &self.inner, - alloc_front, - )?; - let borrow = WrapperCudaRepresentation { - inner: field_inner_repr, - }; - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - let alloc_front = rc::common::RustToCuda::restore(&mut self.inner, alloc_front)?; - Ok(alloc_tail) - } -} -unsafe impl rc::common::RustToCudaAsync for Wrapper -where - T: rc::common::RustToCudaAsync, -{ - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( - &self, - alloc: CudaAllocType, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult< - ( - 
rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let (field_inner_repr, alloc_front) = rc::common::RustToCudaAsync::borrow_async( - &self.inner, - alloc_front, - stream, - )?; - let borrow = WrapperCudaRepresentation { - inner: field_inner_repr, - }; - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - let alloc_front = rc::common::RustToCudaAsync::restore_async( - &mut self.inner, - alloc_front, - stream, - )?; - Ok(alloc_tail) - } -} -unsafe impl rc::common::CudaAsRust for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - type RustRepresentation = Wrapper; -} -#[cuda(crate = "rc")] -pub struct Empty([u8; 0]); -#[allow(dead_code)] -#[doc(hidden)] -#[repr(C)] -#[layout(crate = "rc :: const_type_layout")] -pub struct EmptyCudaRepresentation( - rc::common::DeviceAccessible>, -); -unsafe impl const rc::const_type_layout::TypeLayout for EmptyCudaRepresentation { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let EmptyCudaRepresentation { - 0: _, - .. 
- }: EmptyCudaRepresentation; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const EmptyCudaRepresentation = (&raw const uninit) - .cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::< - rc::common::DeviceAccessible< - rc::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>, - >, - >(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) - = (, - > as rc::const_type_layout::TypeLayout>::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(EmptyCudaRepresentation(f_0.assume_init())), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for EmptyCudaRepresentation { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - , - > as rc::const_type_layout::TypeGraph>::populate_graph(graph); - } - } -} -unsafe impl rc::rustacuda_core::DeviceCopy for EmptyCudaRepresentation {} -unsafe impl rc::common::RustToCuda for Empty { - type CudaRepresentation = EmptyCudaRepresentation; - type CudaAllocation = rc::common::NullCudaAlloc; - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( - &self, - alloc: CudaAllocType, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = 
rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let field_0_repr = rc::common::DeviceAccessible::from(&self.0); - let borrow = EmptyCudaRepresentation(field_0_repr); - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - Ok(alloc_tail) - } -} -unsafe impl rc::common::RustToCudaAsync for Empty { - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( - &self, - alloc: CudaAllocType, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let field_0_repr = rc::common::DeviceAccessible::from(&self.0); - let borrow = EmptyCudaRepresentation(field_0_repr); - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - Ok(alloc_tail) - } -} -unsafe impl rc::common::CudaAsRust for EmptyCudaRepresentation { - type RustRepresentation = Empty; -} -#[repr(C)] -#[layout(crate = "rc::const_type_layout")] -pub struct Tuple(u32, i32); -unsafe impl const rc::const_type_layout::TypeLayout for Tuple { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - 
rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Tuple { 0: _, .. }: Tuple; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Tuple = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - rc::const_type_layout::Field { - name: "1", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Tuple { 1: _, .. }: Tuple; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Tuple = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).1 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let ( - rc::const_type_layout::MaybeUninhabited::Inhabited(f_0), - rc::const_type_layout::MaybeUninhabited::Inhabited(f_1), - ) - = ( - ::uninit(), - ::uninit(), - ) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new( - Tuple(f_0.assume_init(), f_1.assume_init()), - ), - ) - } else { - 
rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for Tuple { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::populate_graph(graph); - ::populate_graph(graph); - } - } -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait KernelArgs -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - type __T_0; - type __T_1; - type __T_2; - type __T_3; - type __T_4; - type __T_5; -} -unsafe impl KernelArgs for () -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - type __T_0 = Dummy; - type __T_1 = Wrapper; - type __T_2 = Wrapper; - type __T_3 = core::sync::atomic::AtomicU64; - type __T_4 = Wrapper; - type __T_5 = Tuple; -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait KernelPtx -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - fn get_ptx_str() -> &'static str - where - Self: Sized + rc::host::Launcher>; - fn new_kernel() -> rc::rustacuda::error::CudaResult< - rc::host::TypedKernel>, - > - where - Self: Sized + rc::host::Launcher>; -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait Kernel: KernelPtx -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - #[allow(clippy::needless_lifetimes)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - fn kernel<'stream, '__r2c_lt_0, '__r2c_lt_1, '__r2c_lt_2, '__r2c_move_lt_4, 'a>( - &mut self, - stream: &'stream rc::rustacuda::stream::Stream, - _x: &'__r2c_lt_0 <() as KernelArgs>::__T_0, - _y: &'__r2c_lt_1 mut 
<() as KernelArgs>::__T_1, - _z: &'__r2c_lt_2 <() as KernelArgs>::__T_2, - _v: &'a <() as KernelArgs>::__T_3, - kernel_arg_4: <() as KernelArgs>::__T_4, - s_t: <() as KernelArgs>::__T_5, - ) -> rc::rustacuda::error::CudaResult<()> - where - Self: Sized + rc::host::Launcher>, - { - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - >::SYNC - } - let mut ___x_box = rc::host::HostDeviceBox::from( - rc::rustacuda::memory::DeviceBox::new( - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), - )?, - ); - #[allow(clippy::redundant_closure_call)] - let __result = (|_x| { - rc::host::LendToCuda::lend_to_cuda_mut( - _y, - |mut _y| { - (|_y| { - rc::host::LendToCuda::lend_to_cuda( - _z, - |_z| { - (|_z| { - let mut ___v_box = rc::host::HostDeviceBox::from( - rc::rustacuda::memory::DeviceBox::new( - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), - )?, - ); - #[allow(clippy::redundant_closure_call)] - let __result = (|_v| { - rc::host::LendToCuda::move_to_cuda( - kernel_arg_4, - |mut kernel_arg_4| { - (|kernel_arg_4| { - { - let s_t = rc::utils::device_copy::SafeDeviceCopyWrapper::from( - s_t, - ); - self.kernel_async( - stream, - _x, - _y, - _z, - _v, - kernel_arg_4, - s_t, - )?; - stream.synchronize() - } - })(kernel_arg_4.as_async()) - }, - ) - })(unsafe { - rc::host::HostAndDeviceConstRef::new( - &___v_box, - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), - ) - .as_async() - }); - if !__check_is_sync(_v) { - ___v_box - .copy_to(unsafe { &mut *(_v as *const _ as *mut _) })?; - } - ::core::mem::drop(___v_box); - __result - })(_z.as_async()) - }, - ) - })(_y.as_async()) - }, - ) - })(unsafe { - rc::host::HostAndDeviceConstRef::new( - &___x_box, - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), - ) - .as_async() - }); - if 
!__check_is_sync(_x) { - ___x_box.copy_to(unsafe { &mut *(_x as *const _ as *mut _) })?; - } - ::core::mem::drop(___x_box); - __result - } - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - fn kernel_async< - 'stream, - '__r2c_lt_0, - '__r2c_lt_1, - '__r2c_lt_2, - '__r2c_move_lt_4, - 'a, - >( - &mut self, - stream: &'stream rc::rustacuda::stream::Stream, - _x: rc::host::HostAndDeviceConstRefAsync< - 'stream, - '__r2c_lt_0, - rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_0>, - >, - mut _y: rc::host::HostAndDeviceMutRefAsync< - 'stream, - '__r2c_lt_1, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _z: rc::host::HostAndDeviceConstRefAsync< - 'stream, - '__r2c_lt_2, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _v: rc::host::HostAndDeviceConstRefAsync< - 'stream, - 'a, - rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_3>, - >, - kernel_arg_4: rc::host::HostAndDeviceOwnedAsync< - 'stream, - '__r2c_move_lt_4, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - s_t: rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_5>, - ) -> rc::rustacuda::error::CudaResult<()> - where - Self: Sized + rc::host::Launcher>, - { - let rc::host::LaunchPackage { kernel, watcher, config } = rc::host::Launcher::get_launch_package( - self, - ); - let kernel_jit_result = if config.ptx_jit { - kernel - .compile_with_ptx_jit_args( - Some( - &[ - None, - Some(rc::ptx_jit::arg_as_raw_bytes(_y.for_host())), - None, - Some(rc::ptx_jit::arg_as_raw_bytes(_v.for_host())), - None, - None, - ], - ), - )? - } else { - kernel.compile_with_ptx_jit_args(None)? 
- }; - let function = match kernel_jit_result { - rc::host::KernelJITResult::Recompiled(function) => { - ::on_compile(function, watcher)?; - function - } - rc::host::KernelJITResult::Cached(function) => function, - }; - #[allow(clippy::redundant_closure_call)] - (| - _x: rc::common::DeviceConstRef< - '__r2c_lt_0, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_0, - >, - >, - _y: rc::common::DeviceMutRef< - '__r2c_lt_1, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _z: rc::common::DeviceConstRef< - '__r2c_lt_2, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _v: rc::common::DeviceConstRef< - 'a, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_3, - >, - >, - kernel_arg_4: rc::common::DeviceMutRef< - '__r2c_move_lt_4, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - s_t: rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_5, - >| - { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rc::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - assert_impl_devicecopy(&_x); - assert_impl_devicecopy(&_y); - assert_impl_devicecopy(&_z); - assert_impl_devicecopy(&_v); - assert_impl_devicecopy(&kernel_arg_4); - assert_impl_devicecopy(&s_t); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_0>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_1>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_2>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_3>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_4>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_5>(); - 
assert_impl_fits_into_device_register(&_x); - assert_impl_fits_into_device_register(&_y); - assert_impl_fits_into_device_register(&_z); - assert_impl_fits_into_device_register(&_v); - assert_impl_fits_into_device_register(&kernel_arg_4); - assert_impl_fits_into_device_register(&s_t); - } - let rc::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _ } = config; - unsafe { - stream - .launch( - function, - grid, - block, - shared_memory_size, - &[ - &_x as *const _ as *mut ::std::ffi::c_void, - &_y as *const _ as *mut ::std::ffi::c_void, - &_z as *const _ as *mut ::std::ffi::c_void, - &_v as *const _ as *mut ::std::ffi::c_void, - &kernel_arg_4 as *const _ as *mut ::std::ffi::c_void, - &s_t as *const _ as *mut ::std::ffi::c_void, - ], - ) - } - })( - unsafe { _x.for_device_async() }, - unsafe { _y.for_device_async() }, - unsafe { _z.for_device_async() }, - unsafe { _v.for_device_async() }, - unsafe { kernel_arg_4.for_device_async() }, - s_t, - ) - } -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe impl> Kernel for K -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{} -#[cfg(not(target_os = "cuda"))] -const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, -> = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_type_layout\n\n.visible .entry kernel_type_layout()\n{\n\n\n\tret;\n\n}\n\t// .globl\tkernel_dfae7eaf723a670c\n.visible .entry kernel_dfae7eaf723a670c()\n{\n\n\n\tret;\n\n}\n" - .as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c".as_bytes(), - ) - }, ->; -#[cfg(not(target_os = "cuda"))] -mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; - #[allow(dead_code)] - struct 
Launcher(core::marker::PhantomData); - unsafe impl KernelPtx for Launcher { - fn get_ptx_str() -> &'static str { - const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_aab1c403129e575b\n.visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_4,\n\t.param .align 4 .b8 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// \n"; - const 
__KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut 
rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1809usize] = b"\x91\x0e\x050.1.0\x86\x01rust_cuda::common::DeviceConstRef>>\x0b\x86\x01rust_cuda::common::DeviceConstRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00j*const rust_cuda::common::DeviceAccessible>\treferenceh\x00\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>j*const rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>icrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const 
rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = 
b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; - const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, - > = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - PTX_STR.as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b" - .as_bytes(), - ) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_0, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - 
rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_3, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_5, - >, - >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) - }, - >; - PTX_STR - } - fn new_kernel() -> rc::rustacuda::error::CudaResult< - rc::host::TypedKernel>, - > { - let ptx = Self::get_ptx_str(); - let entry_point = "kernel_dfae7eaf723a670c_kernel_aab1c403129e575b"; - rc::host::TypedKernel::new(ptx, entry_point) - } - } - unsafe impl KernelPtx> - for Launcher> { - fn get_ptx_str() -> &'static str { - const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_54d0891c50855d77\n.visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_4,\n\t.param .align 4 .b8 
kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// >\n"; - const __KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - 
const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8f\x01*const rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x8f\x01*const 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>i\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; - const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, - > = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - PTX_STR.as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77" - .as_bytes(), - ) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_0, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 
'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_3, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_5, - >, - >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) - }, - >; - PTX_STR - } - fn new_kernel() -> 
rc::rustacuda::error::CudaResult< - rc::host::TypedKernel< - dyn Kernel>, - >, - > { - let ptx = Self::get_ptx_str(); - let entry_point = "kernel_dfae7eaf723a670c_kernel_54d0891c50855d77"; - rc::host::TypedKernel::new(ptx, entry_point) - } - } - impl rc::host::Launcher for Launcher { - type CompilationWatcher = (); - type KernelTraitObject = dyn Kernel; - fn get_launch_package(&mut self) -> rc::host::LaunchPackage { - ::core::panicking::panic("not implemented") - } - } -} diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1682c0c80..896e51e89 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -91,7 +91,7 @@ pub fn rust_to_cuda_trait( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NullCudaAlloc; + let alloc_front = #crate_path::common::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -161,7 +161,7 @@ pub fn rust_to_cuda_async_trait( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NullCudaAlloc; + let alloc_front = #crate_path::common::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 5e11ffc8c..fb5b39503 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - #crate_path::common::NullCudaAlloc + #crate_path::common::NoCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/common.rs b/src/common.rs index 6a7e7d926..cf44848a4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -259,9 +259,13 @@ impl CudaAlloc for T {} impl crate_private::alloc::Sealed for Option {} -pub struct NullCudaAlloc; -impl crate_private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} +pub struct NoCudaAlloc; +impl crate_private::alloc::Sealed for NoCudaAlloc {} +impl private::empty::Sealed for NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl crate_private::alloc::Sealed for SomeCudaAlloc {} +impl !private::empty::Sealed for SomeCudaAlloc {} pub struct CombinedCudaAlloc(A, B); impl crate_private::alloc::Sealed for CombinedCudaAlloc {} diff --git a/src/device/mod.rs b/src/device/mod.rs index f7347aa98..45c833923 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -116,7 +116,3 @@ impl DerefMut for ShallowCopy { &mut self.0 } } - -pub struct SomeCudaAlloc(()); - -impl crate::common::crate_private::alloc::Sealed for SomeCudaAlloc {} diff --git a/src/host.rs b/src/host.rs index 7a5eaf854..aed9aaa83 100644 --- a/src/host.rs +++ b/src/host.rs @@ -21,7 +21,7 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, @@ -196,7 +196,7 @@ impl LendToCuda for T { &self, inner: F, ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); @@ -216,13 +216,13 @@ impl LendToCuda 
for T { &mut self, inner: F, ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); core::mem::drop(cuda_repr); - let _: NullCudaAlloc = unsafe { self.restore(alloc) }?; + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; result } @@ -242,7 +242,7 @@ impl LendToCuda for T { ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc, { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, inner); diff --git a/src/utils/box.rs b/src/utils/box.rs index 195536f0d..8e81941a1 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -34,7 +34,7 @@ unsafe impl RustToCuda for Box { #[cfg(feature = "host")] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(not(feature = "host"))] - type CudaAllocation = crate::device::SomeCudaAlloc; + type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index d5c022ede..4a06e0a8d 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -34,7 +34,7 @@ unsafe impl RustToCuda for Box<[T]> { #[cfg(feature = "host")] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(not(feature = "host"))] - type CudaAllocation = crate::device::SomeCudaAlloc; + type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 46a75824c..0c77a8d1a 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,7 +3,7 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, NullCudaAlloc, RustToCuda, 
RustToCudaAsync}, + common::{CudaAsRust, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -74,7 +74,7 @@ impl SafeDeviceCopyWrapper { } unsafe impl RustToCuda for SafeDeviceCopyWrapper { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = Self; #[cfg(feature = "host")] @@ -86,7 +86,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr DeviceAccessible, CombinedCudaAlloc, )> { - let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } @@ -96,7 +96,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr &mut self, alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } @@ -115,7 +115,7 @@ unsafe impl RustToCudaAsync DeviceAccessible, CombinedCudaAlloc, )> { - let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } @@ -126,7 +126,7 @@ unsafe impl RustToCudaAsync alloc: CombinedCudaAlloc, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 14ffac979..09ffa2b43 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -3,7 +3,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; use crate::{ - common::{NullCudaAlloc, RustToCuda, RustToCudaAsync}, + common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -43,7 +43,7 @@ impl DerefMut unsafe impl RustToCuda for 
CudaExchangeBufferDevice { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index e45efc71e..384f290bb 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,7 +12,7 @@ use rustacuda::{ use crate::{ common::{ - CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync, + CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync, }, host::CudaDropWrapper, safety::SafeDeviceCopy, @@ -107,7 +107,7 @@ impl Dere unsafe impl RustToCuda for CudaExchangeBufferHost { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; #[allow(clippy::type_complexity)] @@ -136,7 +136,7 @@ unsafe impl value: T, device_box: HostDeviceBox::CudaRepresentation>>, locked_cuda_repr: HostLockedBox::CudaRepresentation>>, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -57,7 +57,7 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda::CudaRepresentation>>, locked_cuda_repr: HostLockedBox::CudaRepresentation>>, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, waker: Arc>>, @@ -73,7 +73,7 @@ impl> ExchangeWrapperOnHost { // called first, which initialised the memory. 
let device_box = unsafe { DeviceBox::uninitialized() }?.into(); - let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?; + let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); @@ -99,7 +99,7 @@ impl> ExchangeWrapperOnHost { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_device(mut self) -> CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?; + let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; *self.locked_cuda_repr = cuda_repr; self.device_box.copy_from(&self.locked_cuda_repr)?; @@ -129,7 +129,7 @@ impl> ExchangeWrapperOnHost CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; + let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; *self.locked_cuda_repr = cuda_repr; // Safety: The device value is not safely exposed until either @@ -347,7 +347,7 @@ impl<'stream, T: RustToCuda> /// CUDA pub fn move_to_host(mut self) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -378,7 +378,7 @@ impl<'stream, T: RustToCudaAsync> stream: &'stream Stream, ) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU @@ -456,7 +456,7 @@ impl> ExchangeWrapperOnDevice { /// CUDA pub fn move_to_host(mut self) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = unsafe { 
self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -499,7 +499,7 @@ impl> ExchangeWrapperOnDevice stream: &Stream, ) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU From 41d36161359e3d275abd79e10652433d450cf217 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 14:59:20 +0000 Subject: [PATCH 028/120] Added error handling to the compile-time PTX checking --- rust-cuda-derive/Cargo.toml | 6 +- rust-cuda-derive/build.rs | 2 + rust-cuda-derive/src/kernel/link/mod.rs | 272 ++++++++++------ .../src/kernel/link/ptx_compiler_sys.rs | 301 ++++++++++++++++++ 4 files changed, 485 insertions(+), 96 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 788a08716..41ad5a33f 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -22,7 +22,9 @@ serde_json = "1.0" cargo_metadata = { version = "0.18", features = ["builder"] } strip-ansi-escapes = "0.2" colored = "2.0" - +thiserror = "1.0" seahash = "4.1" ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } -ptx_compiler = "0.1" + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-derive/build.rs index 27d940ad2..f7aa5b1a9 100644 --- a/rust-cuda-derive/build.rs +++ b/rust-cuda-derive/build.rs @@ -1,3 +1,5 @@ fn main() { + find_cuda_helper::include_cuda(); + println!("cargo:rustc-link-lib=nvptxcompiler_static"); } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index d383198ec..f6f4719c4 100644 --- 
a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,9 +1,9 @@ use std::{ env, ffi::CString, + fmt::Write as FmtWrite, fs, io::{Read, Write}, - mem::MaybeUninit, os::raw::c_int, path::{Path, PathBuf}, ptr::addr_of_mut, @@ -16,15 +16,16 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; -use ptx_compiler::sys::size_t; use super::utils::skip_kernel_compilation; mod config; mod error; +mod ptx_compiler_sys; use config::{CheckKernelConfig, LinkKernelConfig}; use error::emit_ptx_build_error; +use ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { @@ -206,100 +207,32 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - let mut compiler = MaybeUninit::uninit(); - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerCreate( - compiler.as_mut_ptr(), - kernel_ptx.len() as size_t, - kernel_ptx.as_ptr().cast(), - ) - }; - emit_call_site_warning!("PTX compiler create result {}", r); - let compiler = unsafe { compiler.assume_init() }; - - let mut major = 0; - let mut minor = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) - }; - emit_call_site_warning!("PTX version result {}", r); - emit_call_site_warning!("PTX compiler version {}.{}", major, minor); + let (result, error_log, info_log, version, drop) = + check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash); - let kernel_name = if specialisation.is_empty() { - format!("{kernel_hash}_kernel") - } else { - format!( - "{kernel_hash}_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; - - let options = vec![ - CString::new("--entry").unwrap(), - CString::new(kernel_name).unwrap(), - CString::new("--verbose").unwrap(), - CString::new("--warn-on-double-precision-use").unwrap(), - 
CString::new("--warn-on-local-memory-usage").unwrap(), - CString::new("--warn-on-spills").unwrap(), - ]; - let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); - - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerCompile( - compiler, - c_int::try_from(options_ptrs.len()).unwrap(), - options_ptrs.as_ptr().cast(), - ) + let ptx_compiler = match &version { + Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), + Err(_) => String::from("PTX compiler"), }; - emit_call_site_warning!("PTX compile result {}", r); - let mut info_log_size = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) - }; - emit_call_site_warning!("PTX info log size result {}", r); - #[allow(clippy::cast_possible_truncation)] - let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); - if info_log_size > 0 { - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) - }; - emit_call_site_warning!("PTX info log content result {}", r); - #[allow(clippy::cast_possible_truncation)] - unsafe { - info_log.set_len(info_log_size as usize); - } - } - let info_log = String::from_utf8_lossy(&info_log); - - let mut error_log_size = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) - }; - emit_call_site_warning!("PTX error log size result {}", r); - #[allow(clippy::cast_possible_truncation)] - let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); - if error_log_size > 0 { - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) - }; - emit_call_site_warning!("PTX error log content result {}", r); - #[allow(clippy::cast_possible_truncation)] - unsafe { - error_log.set_len(error_log_size as usize); - } + // TODO: allow user to select + // - warn on double + // - warn on float + // - warn on spills + // - verbose warn + // - warnings as errors + // - 
show PTX source if warning or error + + let mut errors = String::new(); + if let Err(err) = drop { + let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n")); } - let error_log = String::from_utf8_lossy(&error_log); - - // Ensure the compiler is not dropped - let mut compiler = MaybeUninit::new(compiler); - let r = unsafe { ptx_compiler::sys::nvPTXCompilerDestroy(compiler.as_mut_ptr()) }; - emit_call_site_warning!("PTX compiler destroy result {}", r); - - if !info_log.is_empty() { - emit_call_site_warning!("PTX compiler info log:\n{}", info_log); + if let Err(err) = version { + let _ = errors.write_fmt(format_args!( + "Error fetching the version of the {ptx_compiler}: {err}\n" + )); } - if !error_log.is_empty() { + if let (Ok(Some(_)), _) | (_, Ok(Some(_))) = (&info_log, &error_log) { let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; let mut indent = 0; while max_lines > 0 { @@ -307,9 +240,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { indent += 1; } - abort_call_site!( - "PTX compiler error log:\n{}\nPTX source:\n{}", - error_log, + emit_call_site_warning!( + "PTX source code:\n{}", kernel_ptx .lines() .enumerate() @@ -318,10 +250,162 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .join("\n") ); } + match info_log { + Ok(None) => (), + Ok(Some(info_log)) => emit_call_site_warning!("{ptx_compiler} info log:\n{}", info_log), + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the info log of the {ptx_compiler}: {err}\n" + )); + }, + }; + match error_log { + Ok(None) => (), + Ok(Some(error_log)) => emit_call_site_error!("{ptx_compiler} error log:\n{}", error_log), + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the error log of the {ptx_compiler}: {err}\n" + )); + }, + }; + if let Err(err) = result { + let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n")); + } + if !errors.is_empty() { + 
abort_call_site!("{}", errors); + } (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() } +#[allow(clippy::type_complexity)] +fn check_kernel_ptx( + kernel_ptx: &str, + specialisation: &str, + kernel_hash: &proc_macro2::Ident, +) -> ( + Result<(), NvptxError>, + Result, NvptxError>, + Result, NvptxError>, + Result<(u32, u32), NvptxError>, + Result<(), NvptxError>, +) { + let compiler = { + let mut compiler = std::ptr::null_mut(); + if let Err(err) = NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCreate( + addr_of_mut!(compiler), + kernel_ptx.len() as ptx_compiler_sys::size_t, + kernel_ptx.as_ptr().cast(), + ) + }) { + abort_call_site!("PTX compiler creation failed: {}", err); + } + compiler + }; + + let result = { + let kernel_name = if specialisation.is_empty() { + format!("{kernel_hash}_kernel") + } else { + format!( + "{kernel_hash}_kernel_{:016x}", + seahash::hash(specialisation.as_bytes()) + ) + }; + + let options = vec![ + CString::new("--entry").unwrap(), + CString::new(kernel_name).unwrap(), + CString::new("--verbose").unwrap(), + CString::new("--warn-on-double-precision-use").unwrap(), + CString::new("--warn-on-local-memory-usage").unwrap(), + CString::new("--warn-on-spills").unwrap(), + ]; + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + c_int::try_from(options_ptrs.len()).unwrap(), + options_ptrs.as_ptr().cast(), + ) + }) + }; + + let error_log = (|| { + let mut error_log_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + })?; + + if error_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, 
error_log.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + error_log.set_len(error_log_size as usize); + } + + Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) + })(); + + let info_log = (|| { + let mut info_log_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + })?; + + if info_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + info_log.set_len(info_log_size as usize); + } + + Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) + })(); + + let version = (|| { + let mut major = 0; + let mut minor = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + })?; + + Ok((major, minor)) + })(); + + let drop = { + let mut compiler = compiler; + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) + }) + }; + + (result, error_log, info_log, version, drop) +} + fn compile_kernel( args: &syn::Ident, crate_name: &str, diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs new file mode 100644 index 000000000..93837a418 --- /dev/null +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -0,0 +1,301 @@ +use thiserror::Error; + +#[allow(non_camel_case_types)] +pub type size_t = ::std::os::raw::c_ulonglong; + +#[repr(C)] +pub struct nvPTXCompiler { + _private: [u8; 0], +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Error)] +#[non_exhaustive] +pub enum NvptxError { + #[error("Invalid compiler handle")] + InvalidCompilerHandle, + #[error("Invalid PTX 
input")] + InvalidInput, + #[error("Compilation failure")] + CompilationFailure, + #[error("Internal error")] + Internal, + #[error("Out of memory")] + OutOfMemory, + #[error("Incomplete compiler invocation")] + CompilerInvocationIncomplete, + #[error("Unsupported PTX version")] + UnsupportedPtxVersion, + #[error("Unsupported dev-side sync")] + UnsupportedDevSideSync, + #[error("Unknown error code")] + UnknownError, +} + +impl NvptxError { + const NVPTXCOMPILE_ERROR_COMPILATION_FAILURE: NvptxCompileResult = 3; + const NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE: NvptxCompileResult = 6; + const NVPTXCOMPILE_ERROR_INTERNAL: NvptxCompileResult = 4; + const NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE: NvptxCompileResult = 1; + const NVPTXCOMPILE_ERROR_INVALID_INPUT: NvptxCompileResult = 2; + const NVPTXCOMPILE_ERROR_OUT_OF_MEMORY: NvptxCompileResult = 5; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC: NvptxCompileResult = 8; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; + const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; + + pub fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + match result { + Self::NVPTXCOMPILE_SUCCESS => Ok(()), + Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), + Self::NVPTXCOMPILE_ERROR_INVALID_INPUT => Err(Self::InvalidInput), + Self::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE => Err(Self::CompilationFailure), + Self::NVPTXCOMPILE_ERROR_INTERNAL => Err(Self::Internal), + Self::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY => Err(Self::OutOfMemory), + Self::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE => { + Err(Self::CompilerInvocationIncomplete) + }, + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION => Err(Self::UnsupportedPtxVersion), + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC => Err(Self::UnsupportedDevSideSync), + _ => Err(Self::UnknownError), + } + } +} + +/// [`nvPTXCompilerHandle`] represents a handle to the PTX Compiler. 
+/// +/// To compile a PTX program string, an instance of [`nvPTXCompiler`] +/// must be created and the handle to it must be obtained using the +/// API [`nvPTXCompilerCreate`]. Then the compilation can be done +/// using the API [`nvPTXCompilerCompile`]. +pub type NvptxCompilerHandle = *mut nvPTXCompiler; + +/// The [`nvPTXCompiler`] APIs return the [`nvPTXCompileResult`] codes to +/// indicate the call result"] +pub type NvptxCompileResult = ::std::os::raw::c_int; + +extern "C" { + /// Queries the current major and minor version of PTX Compiler APIs being + /// used + /// + /// # Parameters + /// - [out] `major`: Major version of the PTX Compiler APIs + /// - [out] `minor`: Minor version of the PTX Compiler APIs + /// + /// # Return + /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_SUCCESS` + /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_ERROR_INTERNAL` + /// + /// # Note + /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning. + /// The PTX ISA version supported by a PTX Compiler API version is listed + /// [here](https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes). 
+ pub fn nvPTXCompilerGetVersion( + major: *mut ::std::os::raw::c_uint, + minor: *mut ::std::os::raw::c_uint, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the handle to an instance of the PTX compiler"] + #[doc = " initialized with the given PTX program \\p ptxCode"] + #[doc = ""] + #[doc = " \\param [out] compiler Returns a handle to PTX compiler initialized"] + #[doc = " with the PTX program \\p ptxCode"] + #[doc = " \\param [in] ptxCodeLen Size of the PTX program \\p ptxCode passed as \ + string"] + #[doc = " \\param [in] ptxCode The PTX program which is to be compiled passed as \ + string."] + #[doc = ""] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + pub fn nvPTXCompilerCreate( + compiler: *mut NvptxCompilerHandle, + ptxCodeLen: size_t, + ptxCode: *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Destroys and cleans the already created PTX compiler"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to the PTX compiler which is to be \ + destroyed"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Compile a PTX program with the given compiler options"] + #[doc = ""] + #[doc = " \\param [in,out] 
compiler A handle to PTX compiler initialized with \ + the"] + #[doc = " PTX program which is to be compiled."] + #[doc = " The compiled program can be accessed using \ + the handle"] + #[doc = " \\param [in] numCompileOptions Length of the array \\p compileOptions"] + #[doc = " \\param [in] compileOptions Compiler options with which compilation \ + should be done."] + #[doc = " The compiler options string is a null \ + terminated character array."] + #[doc = " A valid list of compiler options is at"] + #[doc = " link."] + #[doc = " \\note --gpu-name (-arch) is a mandatory option."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerCompile( + compiler: NvptxCompilerHandle, + numCompileOptions: ::std::os::raw::c_int, + compileOptions: *const *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the size of the image of the compiled program"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] binaryImageSize The size of the image of the compiled \ + program"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + 
#[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ + \\endlink"] + #[doc = ""] + #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ + before calling this API."] + #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ + returned."] + pub fn nvPTXCompilerGetCompiledProgramSize( + compiler: NvptxCompilerHandle, + binaryImageSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the image of the compiled program"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] binaryImage The image of the compiled program."] + #[doc = " Client should allocate memory for \\p \ + binaryImage"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ + \\endlink"] + #[doc = ""] + #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ + before calling this API."] + #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ + returned."] + #[doc = ""] + pub fn nvPTXCompilerGetCompiledProgram( + compiler: NvptxCompilerHandle, + binaryImage: *mut ::std::os::raw::c_void, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the size of the error message that was seen previously for \ + the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] errorLogSize The size of the error log in bytes which \ + was 
produced"] + #[doc = " in previous call to nvPTXCompilerCompiler()."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetErrorLogSize( + compiler: NvptxCompilerHandle, + errorLogSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the error message that was seen previously for the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] errorLog The error log which was produced in \ + previous call to nvPTXCompilerCompiler()."] + #[doc = " Clients should allocate memory for \\p \ + errorLog"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetErrorLog( + compiler: NvptxCompilerHandle, + errorLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the size of the information message that was seen \ + previously for the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] infoLogSize The size of the information log in bytes \ + which was produced"] + #[doc = " in previous call to nvPTXCompilerCompiler()."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult 
NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetInfoLogSize( + compiler: NvptxCompilerHandle, + infoLogSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the information message that was seen previously for the \ + handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] infoLog The information log which was produced in \ + previous call to nvPTXCompilerCompiler()."] + #[doc = " Clients should allocate memory for \\p infoLog"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetInfoLog( + compiler: NvptxCompilerHandle, + infoLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; +} From 57e10d77272a9b38fa36c1402db3888e3bb7c940 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 May 2023 04:32:28 +0000 Subject: [PATCH 029/120] Add PTX lint parsing, no actual support yet --- examples/single-source/src/main.rs | 4 + .../src/kernel/link/ptx_compiler_sys.rs | 64 ++++---- rust-cuda-derive/src/kernel/wrapper/mod.rs | 151 +++++++++++++++++- rust-cuda-derive/src/lib.rs | 1 + 4 files changed, 182 insertions(+), 38 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 981c9bccc..997fa88bc 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -40,6 +40,10 @@ pub struct Tuple(u32, i32); #[rc::common::kernel(use link_kernel! 
as impl Kernel for Launcher)] #[kernel(crate = "rc")] +#[kernel( + allow(ptx::double_precision_use), + forbid(ptx::local_memory_usage, ptx::register_spills) +)] pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 93837a418..5e459a623 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -74,15 +74,15 @@ pub type NvptxCompileResult = ::std::os::raw::c_int; extern "C" { /// Queries the current major and minor version of PTX Compiler APIs being - /// used + /// used. /// /// # Parameters /// - [out] `major`: Major version of the PTX Compiler APIs /// - [out] `minor`: Minor version of the PTX Compiler APIs /// - /// # Return - /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_SUCCESS` - /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_ERROR_INTERNAL` + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] /// /// # Note /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning. 
@@ -93,42 +93,38 @@ extern "C" { minor: *mut ::std::os::raw::c_uint, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the handle to an instance of the PTX compiler"] - #[doc = " initialized with the given PTX program \\p ptxCode"] - #[doc = ""] - #[doc = " \\param [out] compiler Returns a handle to PTX compiler initialized"] - #[doc = " with the PTX program \\p ptxCode"] - #[doc = " \\param [in] ptxCodeLen Size of the PTX program \\p ptxCode passed as \ - string"] - #[doc = " \\param [in] ptxCode The PTX program which is to be compiled passed as \ - string."] - #[doc = ""] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + /// Obtains the handle to an instance of the PTX compiler + /// initialized with the given PTX program `ptxCode`. 
+ /// + /// # Parameters + /// - [out] `compiler`: Returns a handle to PTX compiler initialized with + /// the PTX program `ptxCode` + /// - [in] `ptxCodeLen`: Size of the PTX program `ptxCode` passed as a + /// string + /// - [in] `ptxCode`: The PTX program which is to be compiled passed as a + /// string + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] pub fn nvPTXCompilerCreate( compiler: *mut NvptxCompilerHandle, ptxCodeLen: size_t, ptxCode: *const ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Destroys and cleans the already created PTX compiler"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to the PTX compiler which is to be \ - destroyed"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Destroys and cleans the already created PTX compiler. + /// + /// # Parameters + /// - [in] `compiler`: A handle to the PTX compiler which is to be + /// destroyed. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; #[doc = " \\ingroup compilation"] diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index b720a8965..a677c3e0f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -1,4 +1,8 @@ -use std::hash::{Hash, Hasher}; +use std::{ + collections::HashMap, + fmt, + hash::{Hash, Hasher}, +}; use proc_macro::TokenStream; @@ -41,6 +45,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func = parse_kernel_fn(func); let mut crate_path = None; + let mut lint_levels = HashMap::new(); func.attrs.retain(|attr| { if attr.path.is_ident("kernel") { @@ -58,7 +63,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { syn::parse_quote_spanned! { s.span() => #new_crate_path }, ); - return false; + continue; } emit_error!( @@ -73,10 +78,106 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { err ), }, + syn::NestedMeta::Meta(syn::Meta::List(syn::MetaList { + path, + nested, + .. 
+ })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { + let level = match path.get_ident() { + Some(ident) if ident == "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => unreachable!(), + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() || path.segments.empty_or_trailing() || path.segments.len() != 2 { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "register_spills" => PtxLint::RegisterSpills, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + } + }; + + match lint_levels.get(&lint) { + None => (), + 
Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, lint, previous, + ); + } + } + + lint_levels.insert(lint, level); + } + }, _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute." ); } } @@ -84,7 +185,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + "[rust-cuda]: Expected #[kernel(crate = \"\")] or or #[kernel(allow/warn/deny/forbid())] function attribute." ); } @@ -96,6 +197,10 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + let _ = lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); + let _ = lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -341,6 +446,44 @@ struct FuncIdent<'f> { func_ident_hash: syn::Ident, } +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, 
Ord, Hash, Debug)] +enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUsage, + RegisterSpills, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::RegisterSpills => fmt.write_str("register_spills"), + } + } +} + fn ident_from_pat(pat: &syn::Pat) -> Option { match pat { syn::Pat::Lit(_) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index d5d8f3018..572e1c9da 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -4,6 +4,7 @@ #![feature(proc_macro_span)] #![feature(if_let_guard)] #![feature(let_chains)] +#![feature(map_try_insert)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From 1ab8b471dfc01b822b5b5d2a81cb18c78b333d13 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 May 2023 19:01:57 +0000 Subject: [PATCH 030/120] Added lint checking support to monomorphised kernel impls --- rust-cuda-derive/src/kernel/link/config.rs | 19 +- rust-cuda-derive/src/kernel/link/mod.rs | 96 ++++++++-- rust-cuda-derive/src/kernel/lints.rs | 154 ++++++++++++++++ rust-cuda-derive/src/kernel/mod.rs | 1 + .../generate/cpu_linker_macro/get_ptx_str.rs | 4 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 2 + rust-cuda-derive/src/kernel/wrapper/mod.rs | 164 ++++-------------- 7 files changed, 291 insertions(+), 149 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/lints.rs diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index bb5f011d6..e2b399dc4 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -1,4 +1,6 @@ -use std::path::PathBuf; +use std::{collections::HashMap, path::PathBuf}; + +use 
super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { @@ -8,6 +10,7 @@ pub(super) struct LinkKernelConfig { pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, + pub(super) ptx_lint_levels: HashMap, } impl syn::parse::Parse for LinkKernelConfig { @@ -37,6 +40,19 @@ impl syn::parse::Parse for LinkKernelConfig { String::new() }; + let attrs = syn::punctuated::Punctuated::< + syn::MetaList, + syn::token::Comma, + >::parse_separated_nonempty(input)?; + + let mut ptx_lint_levels = HashMap::new(); + + for syn::MetaList { path, nested, .. } in attrs { + parse_ptx_lint_level(&path, &nested, &mut ptx_lint_levels); + } + + proc_macro_error::abort_if_dirty(); + Ok(Self { kernel, kernel_hash, @@ -44,6 +60,7 @@ impl syn::parse::Parse for LinkKernelConfig { crate_name: name.value(), crate_path: PathBuf::from(path.value()), specialisation, + ptx_lint_levels, }) } } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index f6f4719c4..a79505c13 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,4 +1,5 @@ use std::{ + collections::HashMap, env, ffi::CString, fmt::Write as FmtWrite, @@ -17,7 +18,10 @@ use ptx_builder::{ error::{BuildErrorKind, Error, Result}, }; -use super::utils::skip_kernel_compilation; +use super::{ + lints::{LintLevel, PtxLint}, + utils::skip_kernel_compilation, +}; mod config; mod error; @@ -68,12 +72,14 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { crate_name, crate_path, specialisation, + ptx_lint_levels, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION) expects KERNEL and ARGS \ - identifiers, NAME and PATH string literals, and SPECIALISATION tokens: {:?}", + "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION 
LINTS,*) expects KERNEL and \ + ARGS identifiers, NAME and PATH string literals, SPECIALISATION and LINTS \ + tokens: {:?}", err ) }, @@ -208,7 +214,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } let (result, error_log, info_log, version, drop) = - check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash); + check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash, &ptx_lint_levels); let ptx_compiler = match &version { Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), @@ -279,10 +285,12 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } #[allow(clippy::type_complexity)] +#[allow(clippy::too_many_lines)] fn check_kernel_ptx( kernel_ptx: &str, specialisation: &str, kernel_hash: &proc_macro2::Ident, + ptx_lint_levels: &HashMap, ) -> ( Result<(), NvptxError>, Result, NvptxError>, @@ -304,7 +312,7 @@ fn check_kernel_ptx( compiler }; - let result = { + let result = (|| { let kernel_name = if specialisation.is_empty() { format!("{kernel_hash}_kernel") } else { @@ -313,15 +321,79 @@ fn check_kernel_ptx( seahash::hash(specialisation.as_bytes()) ) }; - - let options = vec![ + let mut options = vec![ CString::new("--entry").unwrap(), CString::new(kernel_name).unwrap(), - CString::new("--verbose").unwrap(), - CString::new("--warn-on-double-precision-use").unwrap(), - CString::new("--warn-on-local-memory-usage").unwrap(), - CString::new("--warn-on-spills").unwrap(), ]; + + if ptx_lint_levels + .values() + .any(|level| *level > LintLevel::Warn) + { + let mut options = options.clone(); + + if ptx_lint_levels + .get(&PtxLint::Verbose) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--verbose").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::DoublePrecisionUse) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--warn-on-double-precision-use").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUsage) + .map_or(false, |level| *level > 
LintLevel::Warn) + { + options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--warn-on-spills").unwrap()); + } + options.push(CString::new("--warning-as-error").unwrap()); + + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + options_ptrs.len() as c_int, + options_ptrs.as_ptr().cast(), + ) + })?; + }; + + if ptx_lint_levels + .get(&PtxLint::Verbose) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--verbose").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::DoublePrecisionUse) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-double-precision-use").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUsage) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-spills").unwrap()); + } + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); NvptxError::try_err_from(unsafe { @@ -331,7 +403,7 @@ fn check_kernel_ptx( options_ptrs.as_ptr().cast(), ) }) - }; + })(); let error_log = (|| { let mut error_log_size = 0; diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs new file mode 100644 index 000000000..6cdb63ca8 --- /dev/null +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -0,0 +1,154 @@ +use std::{collections::HashMap, fmt}; + +use syn::spanned::Spanned; + +pub fn parse_ptx_lint_level( + path: &syn::Path, + nested: &syn::punctuated::Punctuated, + ptx_lint_levels: &mut HashMap, +) { + let level = match path.get_ident() { + Some(ident) if ident 
== "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => { + emit_error!( + path.span(), + "[rust-cuda]: Invalid lint #[kernel(())] attribute: unknown lint \ + level, must be one of `allow`, `warn`, `deny`, `forbid`.", + ); + + return; + }, + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() + || path.segments.empty_or_trailing() + || path.segments.len() != 2 + { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "register_spills" => PtxLint::RegisterSpills, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + }, + }; + + match ptx_lint_levels.get(&lint) { + None => (), 
+ Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, + lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, + lint, + previous, + ); + }, + } + + ptx_lint_levels.insert(lint, level); + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUsage, + RegisterSpills, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::RegisterSpills => fmt.write_str("register_spills"), + } + } +} diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs index c44f1dd2f..6dff13380 100644 --- a/rust-cuda-derive/src/kernel/mod.rs +++ b/rust-cuda-derive/src/kernel/mod.rs @@ -2,4 +2,5 @@ pub mod link; pub mod specialise; pub mod wrapper; +mod lints; mod utils; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index b3e215a20..d62445803 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -5,6 
+5,7 @@ use crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +#[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx_str( crate_path: &syn::Path, FuncIdent { @@ -21,6 +22,7 @@ pub(super) fn quote_get_ptx_str( inputs: &FunctionInputs, func_params: &[syn::Ident], macro_type_ids: &[syn::Ident], + ptx_lint_levels: &TokenStream, ) -> TokenStream { let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { Ok(crate_name) => crate_name.to_uppercase(), @@ -80,7 +82,7 @@ pub(super) fn quote_get_ptx_str( #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* - #generic_close_token + #generic_close_token #ptx_lint_levels } #matching_kernel_assert diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 91f94a568..0ca963bb2 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -26,6 +26,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs: &FunctionInputs, func_ident: &FuncIdent, func_params: &[syn::Ident], + ptx_lint_levels: &TokenStream, ) -> TokenStream { let macro_types = generic_params .iter() @@ -59,6 +60,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs, func_params, ¯o_type_ids, + ptx_lint_levels, ); let new_kernel = quote_new_kernel( crate_path, diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a677c3e0f..ee7cfa404 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -1,6 +1,5 @@ use std::{ collections::HashMap, - fmt, hash::{Hash, Hasher}, }; @@ -11,6 +10,8 @@ mod generate; mod inputs; mod parse; +use 
super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; + use config::KernelConfig; use generate::{ args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, @@ -19,7 +20,7 @@ use generate::{ }; use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; -use proc_macro2::Span; +use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] @@ -45,7 +46,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func = parse_kernel_fn(func); let mut crate_path = None; - let mut lint_levels = HashMap::new(); + let mut ptx_lint_levels = HashMap::new(); func.attrs.retain(|attr| { if attr.path.is_ident("kernel") { @@ -83,96 +84,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { nested, .. })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { - let level = match path.get_ident() { - Some(ident) if ident == "allow" => LintLevel::Allow, - Some(ident) if ident == "warn" => LintLevel::Warn, - Some(ident) if ident == "deny" => LintLevel::Deny, - Some(ident) if ident == "forbid" => LintLevel::Forbid, - _ => unreachable!(), - }; - - for meta in nested { - let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute.", - level, - ); - continue; - }; - - if path.leading_colon.is_some() || path.segments.empty_or_trailing() || path.segments.len() != 2 { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - } - - let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - }; - - if namespace != "ptx" { - emit_error!( - meta.span(), - "[rust-cuda]: 
Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - } - - let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - }; - - let lint = match lint { - l if l == "verbose" => PtxLint::Verbose, - l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, - l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, - l if l == "register_spills" => PtxLint::RegisterSpills, - _ => { - emit_error!( - meta.span(), - "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", - lint, - ); - continue; - } - }; - - match lint_levels.get(&lint) { - None => (), - Some(LintLevel::Forbid) if level < LintLevel::Forbid => { - emit_error!( - meta.span(), - "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", - level, lint, - ); - continue; - }, - Some(previous) => { - emit_warning!( - meta.span(), - "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", - level, lint, previous, - ); - } - } - - lint_levels.insert(lint, level); - } + parse_ptx_lint_level(path, nested, &mut ptx_lint_levels); }, _ => { emit_error!( @@ -197,9 +109,26 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); - let _ = lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); - let _ = lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); - let _ = lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + + let 
ptx_lint_levels = { + let (lints, levels): (Vec, Vec) = ptx_lint_levels + .into_iter() + .map(|(lint, level)| { + ( + Ident::new(&lint.to_string(), Span::call_site()), + Ident::new(&level.to_string(), Span::call_site()), + ) + }) + .unzip(); + + quote! { + #(#levels(ptx::#lints)),* + } + }; let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -376,6 +305,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_inputs, &func_ident, &func_params, + &ptx_lint_levels, ); let cuda_wrapper = quote_cuda_wrapper( &crate_path, @@ -446,44 +376,6 @@ struct FuncIdent<'f> { func_ident_hash: syn::Ident, } -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -enum LintLevel { - Allow, - Warn, - Deny, - Forbid, -} - -impl fmt::Display for LintLevel { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Allow => fmt.write_str("allow"), - Self::Warn => fmt.write_str("warn"), - Self::Deny => fmt.write_str("deny"), - Self::Forbid => fmt.write_str("forbid"), - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -enum PtxLint { - Verbose, - DoublePrecisionUse, - LocalMemoryUsage, - RegisterSpills, -} - -impl fmt::Display for PtxLint { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Verbose => fmt.write_str("verbose"), - Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), - Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), - Self::RegisterSpills => fmt.write_str("register_spills"), - } - } -} - fn ident_from_pat(pat: &syn::Pat) -> Option { match pat { syn::Pat::Lit(_) @@ -547,7 +439,9 @@ fn quote_generic_check( #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }> = #crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( - #crate_path::host::check_kernel!(#args #crate_name 
#crate_manifest_dir).as_bytes(), + #crate_path::host::check_kernel!( + #args #crate_name #crate_manifest_dir + ).as_bytes(), concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() ) }>; From d8a732f85d1c168d6c4f1726155624009115aac0 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 07:23:25 +0000 Subject: [PATCH 031/120] Improve kernel checking + added cubin dump lint --- rust-cuda-derive/src/kernel/link/config.rs | 3 + rust-cuda-derive/src/kernel/link/mod.rs | 195 +++++++++++--- .../src/kernel/link/ptx_compiler_sys.rs | 246 ++++++++---------- rust-cuda-derive/src/kernel/lints.rs | 3 + .../src/kernel/specialise/entry.rs | 2 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 +- 6 files changed, 278 insertions(+), 185 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index e2b399dc4..efb7899fa 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -67,6 +67,7 @@ impl syn::parse::Parse for LinkKernelConfig { #[allow(clippy::module_name_repetitions)] pub(super) struct CheckKernelConfig { + pub(super) kernel_hash: syn::Ident, pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, @@ -74,11 +75,13 @@ pub(super) struct CheckKernelConfig { impl syn::parse::Parse for CheckKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { + let kernel_hash: syn::Ident = input.parse()?; let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; Ok(Self { + kernel_hash, args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index a79505c13..5d5fef5b4 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -32,11 +32,10 @@ use error::emit_ptx_build_error; use 
ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - "ERROR in this PTX compilation" - }); + proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); let CheckKernelConfig { + kernel_hash, args, crate_name, crate_path, @@ -44,8 +43,8 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "check_kernel!(ARGS NAME PATH) expects ARGS identifier, NAME and PATH string \ - literals: {:?}", + "check_kernel!(HASH ARGS NAME PATH) expects HASH and ARGS identifiers, annd NAME \ + and PATH string literals: {:?}", err ) }, @@ -53,10 +52,18 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); - match kernel_ptx { - Some(kernel_ptx) => quote!(#kernel_ptx).into(), - None => quote!("ERROR in this PTX compilation").into(), - } + let Some(kernel_ptx) = kernel_ptx else { + return quote!(::core::result::Result::Err(())).into() + }; + + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Check, + &kernel_hash, + &HashMap::new(), + ); + + quote!(::core::result::Result::Ok(())).into() } #[allow(clippy::module_name_repetitions, clippy::too_many_lines)] @@ -77,9 +84,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ - ARGS identifiers, NAME and PATH string literals, SPECIALISATION and LINTS \ - tokens: {:?}", + "link_kernel!(KERNEL HASH ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL, \ + HASH, and ARGS identifiers, NAME and PATH string literals, and SPECIALISATION \ + and LINTS tokens: {:?}", err ) }, @@ -213,32 +220,44 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - let (result, error_log, info_log, 
version, drop) = - check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash, &ptx_lint_levels); + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Link(&specialisation), + &kernel_hash, + &ptx_lint_levels, + ); + + (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +} + +#[allow(clippy::too_many_lines)] +fn check_kernel_ptx_and_report( + kernel_ptx: &str, + specialisation: Specialisation, + kernel_hash: &proc_macro2::Ident, + ptx_lint_levels: &HashMap, +) { + let (result, error_log, info_log, binary, version, drop) = + check_kernel_ptx(kernel_ptx, specialisation, kernel_hash, ptx_lint_levels); let ptx_compiler = match &version { Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), Err(_) => String::from("PTX compiler"), }; - // TODO: allow user to select - // - warn on double - // - warn on float - // - warn on spills - // - verbose warn - // - warnings as errors - // - show PTX source if warning or error - let mut errors = String::new(); + if let Err(err) = drop { let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n")); } + if let Err(err) = version { let _ = errors.write_fmt(format_args!( "Error fetching the version of the {ptx_compiler}: {err}\n" )); } - if let (Ok(Some(_)), _) | (_, Ok(Some(_))) = (&info_log, &error_log) { + + let ptx_source_code = { let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; let mut indent = 0; while max_lines > 0 { @@ -246,7 +265,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { indent += 1; } - emit_call_site_warning!( + format!( "PTX source code:\n{}", kernel_ptx .lines() @@ -254,47 +273,109 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .map(|(i, l)| format!("{:indent$}| {l}", i + 1)) .collect::>() .join("\n") - ); + ) + }; + + match binary { + Ok(None) => (), + Ok(Some(binary)) => { + if ptx_lint_levels + .get(&PtxLint::DumpBinary) + .map_or(false, |level| *level > LintLevel::Allow) + { + const 
HEX: [char; 16] = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + ]; + + let mut binary_hex = String::with_capacity(binary.len() * 2); + for byte in binary { + binary_hex.push(HEX[usize::from(byte >> 4)]); + binary_hex.push(HEX[usize::from(byte & 0x0F)]); + } + + if ptx_lint_levels + .get(&PtxLint::DumpBinary) + .map_or(false, |level| *level > LintLevel::Warn) + { + emit_call_site_error!( + "{} compiled binary:\n{}\n\n{}", + ptx_compiler, + binary_hex, + ptx_source_code + ); + } else { + emit_call_site_warning!( + "{} compiled binary:\n{}\n\n{}", + ptx_compiler, + binary_hex, + ptx_source_code + ); + } + } + }, + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the compiled binary from {ptx_compiler}: {err}\n" + )); + }, } + match info_log { Ok(None) => (), - Ok(Some(info_log)) => emit_call_site_warning!("{ptx_compiler} info log:\n{}", info_log), + Ok(Some(info_log)) => emit_call_site_warning!( + "{} info log:\n{}\n{}", + ptx_compiler, + info_log, + ptx_source_code + ), Err(err) => { let _ = errors.write_fmt(format_args!( "Error fetching the info log of the {ptx_compiler}: {err}\n" )); }, }; - match error_log { - Ok(None) => (), - Ok(Some(error_log)) => emit_call_site_error!("{ptx_compiler} error log:\n{}", error_log), + + let error_log = match error_log { + Ok(None) => String::new(), + Ok(Some(error_log)) => { + format!("{ptx_compiler} error log:\n{error_log}\n{ptx_source_code}") + }, Err(err) => { let _ = errors.write_fmt(format_args!( "Error fetching the error log of the {ptx_compiler}: {err}\n" )); + String::new() }, }; + if let Err(err) = result { let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n")); } - if !errors.is_empty() { - abort_call_site!("{}", errors); - } - (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() + if !error_log.is_empty() || !errors.is_empty() { + abort_call_site!( + "{error_log}{}{errors}", + if !error_log.is_empty() && !errors.is_empty() { + "\n\n" + } else { + "" + } + ); + } } #[allow(clippy::type_complexity)] #[allow(clippy::too_many_lines)] fn check_kernel_ptx( kernel_ptx: &str, - specialisation: &str, + specialisation: Specialisation, kernel_hash: &proc_macro2::Ident, ptx_lint_levels: &HashMap, ) -> ( Result<(), NvptxError>, Result, NvptxError>, Result, NvptxError>, + Result>, NvptxError>, Result<(u32, u32), NvptxError>, Result<(), NvptxError>, ) { @@ -313,14 +394,15 @@ fn check_kernel_ptx( }; let result = (|| { - let kernel_name = if specialisation.is_empty() { - format!("{kernel_hash}_kernel") - } else { - format!( + let kernel_name = match specialisation { + Specialisation::Check => format!("{kernel_hash}_chECK"), + Specialisation::Link("") => format!("{kernel_hash}_kernel"), + Specialisation::Link(specialisation) => format!( "{kernel_hash}_kernel_{:016x}", seahash::hash(specialisation.as_bytes()) - ) + ), }; + let mut options = vec![ CString::new("--entry").unwrap(), CString::new(kernel_name).unwrap(), @@ -457,6 +539,39 @@ fn check_kernel_ptx( Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) })(); + let binary = (|| { + if result.is_err() { + return Ok(None); + } + + let mut binary_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( + compiler, + addr_of_mut!(binary_size), + ) + })?; + + if binary_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut binary: Vec = Vec::with_capacity(binary_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + binary.set_len(binary_size as usize); + } + + Ok(Some(binary)) + })(); + let 
version = (|| { let mut major = 0; let mut minor = 0; @@ -475,7 +590,7 @@ fn check_kernel_ptx( }) }; - (result, error_log, info_log, version, drop) + (result, error_log, info_log, binary, version, drop) } fn compile_kernel( diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 5e459a623..0ab332dad 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -74,7 +74,7 @@ pub type NvptxCompileResult = ::std::os::raw::c_int; extern "C" { /// Queries the current major and minor version of PTX Compiler APIs being - /// used. + /// used. /// /// # Parameters /// - [out] `major`: Major version of the PTX Compiler APIs @@ -94,7 +94,7 @@ extern "C" { ) -> NvptxCompileResult; /// Obtains the handle to an instance of the PTX compiler - /// initialized with the given PTX program `ptxCode`. + /// initialized with the given PTX program `ptxCode`. /// /// # Parameters /// - [out] `compiler`: Returns a handle to PTX compiler initialized with @@ -127,169 +127,147 @@ extern "C" { /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Compile a PTX program with the given compiler options"] - #[doc = ""] - #[doc = " \\param [in,out] compiler A handle to PTX compiler initialized with \ - the"] - #[doc = " PTX program which is to be compiled."] - #[doc = " The compiled program can be accessed using \ - the handle"] - #[doc = " \\param [in] numCompileOptions Length of the array \\p compileOptions"] - #[doc = " \\param [in] compileOptions Compiler options with which compilation \ - should be done."] - #[doc = " The compiler options string is a null \ - terminated character array."] - #[doc = " A valid list of compiler options is at"] - #[doc = " link."] - #[doc = " \\note 
--gpu-name (-arch) is a mandatory option."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION \\endlink"] - #[doc = ""] + /// Compile a PTX program with the given compiler options. + /// + /// # Parameters + /// - [in, out] `compiler`: A handle to PTX compiler initialized with the + /// PTX program which is to be compiled. The compiled program can be + /// accessed using the handle. + /// - [in] `numCompileOptions`: Length of the array `compileOptions` + /// - [in] `compileOptions`: Compiler options with which compilation should + /// be done. The compiler options string is a null terminated character + /// array. A valid list of compiler options is available at + /// [link](http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options). + /// + /// # Note + /// `--gpu-name` (`-arch`) is a mandatory option. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION`] pub fn nvPTXCompilerCompile( compiler: NvptxCompilerHandle, numCompileOptions: ::std::os::raw::c_int, compileOptions: *const *const ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the size of the image of the compiled program"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] binaryImageSize The size of the image of the compiled \ - program"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ - \\endlink"] - #[doc = ""] - #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ - before calling this API."] - #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ - returned."] + /// Obtains the size of the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. 
+ /// - [out] `binaryImageSize`: The size of the image of the compiled program + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. pub fn nvPTXCompilerGetCompiledProgramSize( compiler: NvptxCompilerHandle, binaryImageSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the image of the compiled program"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] binaryImage The image of the compiled program."] - #[doc = " Client should allocate memory for \\p \ - binaryImage"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ - \\endlink"] - #[doc = ""] - #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ - before calling this API."] - #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ - returned."] - #[doc = ""] + /// Obtains the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImage`: The image of the compiled program. 
The caller + /// should allocate memory for `binaryImage`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. pub fn nvPTXCompilerGetCompiledProgram( compiler: NvptxCompilerHandle, binaryImage: *mut ::std::os::raw::c_void, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the size of the error message that was seen previously for \ - the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] errorLogSize The size of the error log in bytes which \ - was produced"] - #[doc = " in previous call to nvPTXCompilerCompiler()."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the size of the error message that was seen previously for the + /// handle. + /// + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLogSize`: The size of the error log in bytes which was + /// produced in previous call to [`nvPTXCompilerCompile`]. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetErrorLogSize( compiler: NvptxCompilerHandle, errorLogSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the error message that was seen previously for the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] errorLog The error log which was produced in \ - previous call to nvPTXCompilerCompiler()."] - #[doc = " Clients should allocate memory for \\p \ - errorLog"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the error message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLog`: The error log which was produced in previous call to + /// [`nvPTXCompilerCompile`]. The caller should allocate memory for + /// `errorLog`. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetErrorLog( compiler: NvptxCompilerHandle, errorLog: *mut ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the size of the information message that was seen \ - previously for the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] infoLogSize The size of the information log in bytes \ - which was produced"] - #[doc = " in previous call to nvPTXCompilerCompiler()."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the size of the information message that was seen previously for + /// the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLogSize`: The size of the information log in bytes which + /// was produced in previous call to [`nvPTXCompilerCompile`]. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetInfoLogSize( compiler: NvptxCompilerHandle, infoLogSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the information message that was seen previously for the \ - handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] infoLog The information log which was produced in \ - previous call to nvPTXCompilerCompiler()."] - #[doc = " Clients should allocate memory for \\p infoLog"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the information message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLog`: The information log which was produced in previous + /// call to [`nvPTXCompilerCompile`]. The caller should allocate memory + /// for `infoLog`. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetInfoLog( compiler: NvptxCompilerHandle, infoLog: *mut ::std::os::raw::c_char, diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index 6cdb63ca8..e91222dcd 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -79,6 +79,7 @@ pub fn parse_ptx_lint_level( l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, l if l == "register_spills" => PtxLint::RegisterSpills, + l if l == "dump_binary" => PtxLint::DumpBinary, _ => { emit_error!( meta.span(), @@ -140,6 +141,7 @@ pub enum PtxLint { DoublePrecisionUse, LocalMemoryUsage, RegisterSpills, + DumpBinary, } impl fmt::Display for PtxLint { @@ -149,6 +151,7 @@ impl fmt::Display for PtxLint { Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), Self::RegisterSpills => fmt.write_str("register_spills"), + Self::DumpBinary => fmt.write_str("dump_binary"), } } } diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-derive/src/kernel/specialise/entry.rs index e8bce23b9..b85a433e7 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-derive/src/kernel/specialise/entry.rs @@ -33,7 +33,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr func.sig.ident = match proc_macro::tracked_env::var(&specialisation_var).as_deref() { Ok("") => quote::format_ident!("{}_kernel", func.sig.ident), Ok("chECK") => { - let func_ident = func.sig.ident; + let func_ident = quote::format_ident!("{}_chECK", func.sig.ident); return (quote! 
{ #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index ee7cfa404..a70c38e94 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -113,6 +113,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); let ptx_lint_levels = { let (lints, levels): (Vec, Vec) = ptx_lint_levels @@ -435,15 +436,8 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: #crate_path::safety::kernel_signature::Assert<{ - #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = #crate_path::safety::kernel_signature::Assert::<{ - #crate_path::safety::kernel_signature::check( - #crate_path::host::check_kernel!( - #args #crate_name #crate_manifest_dir - ).as_bytes(), - concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() - ) - }>; + const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( + #func_ident_hash #args #crate_name #crate_manifest_dir + ); } } From 8f4e7a17e46df26138026dd767fd1553950c6099 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 08:03:58 +0000 Subject: [PATCH 032/120] Fix kernel macro config parsing --- rust-cuda-derive/src/kernel/wrapper/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index 382db35f9..d8951230d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -20,7 +20,7 @@ impl syn::parse::Parse 
for KernelConfig { let args: syn::Ident = input.parse()?; let _comma: syn::token::Comma = input.parse()?; let ptx: syn::Ident = input.parse()?; - let _comma: Option = input.parse()?; + let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; let launcher: syn::Ident = input.parse()?; From e9df07ddb3921e3ef8031c90aaa7509f8e77252b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 08:58:04 +0000 Subject: [PATCH 033/120] Explicitly fitting Device[Const|Mut]Ref into device registers --- rust-cuda-derive/src/kernel/link/mod.rs | 2 +- src/safety/register_fit.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 5d5fef5b4..cdc727a22 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -445,7 +445,7 @@ fn check_kernel_ptx( NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, - options_ptrs.len() as c_int, + c_int::try_from(options_ptrs.len()).unwrap(), options_ptrs.as_ptr().cast(), ) })?; diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs index 1ddf33849..ef1c8ce98 100644 --- a/src/safety/register_fit.rs +++ b/src/safety/register_fit.rs @@ -2,12 +2,24 @@ pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} impl FitsIntoDeviceRegister for T {} mod private { + #[marker] pub trait FitsIntoDeviceRegister {} impl FitsIntoDeviceRegister for T where AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits { } + // Since T: Sized, the pointers are thin, and must thus fit into device + // registers + impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister + for crate::common::DeviceConstRef<'r, T> + { + } + impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister + for crate::common::DeviceMutRef<'r, T> + { + } + #[derive(PartialEq, Eq, 
core::marker::ConstParamTy)] pub enum TypeSize { TypeFitsInto64Bits, From cff4eab1c4dbe6a1248ed707482b94b22fa2fea9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 09:25:48 +0000 Subject: [PATCH 034/120] Switched one std:: to core:: --- .../wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 462855156..9a22a46e8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -119,7 +119,7 @@ pub(super) fn quote_kernel_func_async( unsafe { stream.launch(function, grid, block, shared_memory_size, &[ #( - &#func_params as *const _ as *mut ::std::ffi::c_void + &#func_params as *const _ as *mut ::core::ffi::c_void ),* ] ) } From fb9461abfd8b03df3046ddb34e280e00bccf515e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 12:25:17 +0000 Subject: [PATCH 035/120] Remove register-sized CUDA kernel args check, unnecessary since https://github.com/rust-lang/rust/pull/94703 --- examples/single-source/src/main.rs | 8 ++- .../cpu_wrapper/kernel_func_async/mod.rs | 6 -- .../kernel/wrapper/generate/cuda_wrapper.rs | 6 -- src/safety/mod.rs | 2 - src/safety/register_fit.rs | 55 ------------------- 5 files changed, 7 insertions(+), 70 deletions(-) delete mode 100644 src/safety/register_fit.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 997fa88bc..ccd384676 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -38,6 +38,11 @@ pub struct Empty([u8; 0]); #[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); +#[repr(C)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = 
"rc::const_type_layout")] +pub struct Triple(i32, i32, i32); + #[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] #[kernel(crate = "rc")] #[kernel( @@ -51,6 +56,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, + #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where T: rc::safety::StackOnly + rc::safety::NoAliasing, @@ -65,7 +71,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { - (*shared2.index_mut_unchecked(2)).1 = 24; + (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } // unsafe { core::arch::asm!("hi") } // unsafe { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 9a22a46e8..6e123440e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -102,14 +102,8 @@ pub(super) fn quote_kernel_func_async( #[allow(dead_code)] fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: #crate_path::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - #(assert_impl_devicecopy(&#func_params);)* #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* } let #crate_path::host::LaunchConfig { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 36e316708..29473858e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ 
b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -136,14 +136,8 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(dead_code)] fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: #crate_path::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - #(assert_impl_devicecopy(&#func_params);)* #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* } #ptx_func_input_unwrap diff --git a/src/safety/mod.rs b/src/safety/mod.rs index cf7a8f718..7e5c419a5 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,7 +1,6 @@ mod arch; mod device_copy; mod no_aliasing; -mod register_fit; mod stack_only; #[cfg(any(feature = "alloc", doc))] mod unified_heap; @@ -13,7 +12,6 @@ pub mod type_layout; pub use device_copy::SafeDeviceCopy; pub use no_aliasing::NoAliasing; -pub use register_fit::FitsIntoDeviceRegister; pub use stack_only::StackOnly; #[cfg(any(feature = "alloc", doc))] pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs deleted file mode 100644 index ef1c8ce98..000000000 --- a/src/safety/register_fit.rs +++ /dev/null @@ -1,55 +0,0 @@ -pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} -impl FitsIntoDeviceRegister for T {} - -mod private { - #[marker] - pub trait FitsIntoDeviceRegister {} - impl FitsIntoDeviceRegister for T where - AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits - { - } - - // Since T: Sized, the pointers are thin, and must thus fit into device - // registers - impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister - for crate::common::DeviceConstRef<'r, T> - { - } - impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister - for crate::common::DeviceMutRef<'r, T> - { - } - - #[derive(PartialEq, Eq, core::marker::ConstParamTy)] - pub enum TypeSize { - TypeFitsInto64Bits, - // FIXME: ConstParamTy variant with str 
ICEs in rustdoc - #[cfg(not(doc))] - TypeExeceeds64Bits(&'static str), - #[cfg(doc)] - TypeExeceeds64Bits, - } - - impl TypeSize { - pub const fn check() -> Self { - if core::mem::size_of::() <= core::mem::size_of::() { - Self::TypeFitsInto64Bits - } else { - #[cfg(not(doc))] - { - Self::TypeExeceeds64Bits(core::any::type_name::()) - } - #[cfg(doc)] - { - Self::TypeExeceeds64Bits - } - } - } - } - - pub enum AssertTypeFitsInto64Bits {} - - pub trait FitsInto64Bits {} - - impl FitsInto64Bits for AssertTypeFitsInto64Bits<{ TypeSize::TypeFitsInto64Bits }> {} -} From e33a270b8b2e34101336d3558f3f62c9a7a2c227 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 14 May 2023 09:30:51 +0000 Subject: [PATCH 036/120] Simplified the kernel parameter layout extraction from PTX --- rust-cuda-derive/src/kernel/link/mod.rs | 185 ++++++++---------- rust-cuda-derive/src/kernel/mod.rs | 3 + .../kernel/wrapper/generate/cuda_wrapper.rs | 23 +-- rust-cuda-ptx-jit/src/device.rs | 2 +- 4 files changed, 102 insertions(+), 111 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index cdc727a22..8df29f33a 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -21,6 +21,7 @@ use ptx_builder::{ use super::{ lints::{LintLevel, PtxLint}, utils::skip_kernel_compilation, + KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, }; mod config; @@ -66,14 +67,14 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { quote!(::core::result::Result::Ok(())).into() } -#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] +#[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! 
{ const PTX_STR: &'static str = "ERROR in this PTX compilation"; }); let LinkKernelConfig { - kernel, + kernel: _kernel, kernel_hash, args, crate_name, @@ -111,123 +112,109 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .into(); }; - let kernel_layout_name = if specialisation.is_empty() { - format!("{kernel}_type_layout_kernel") - } else { - format!( - "{kernel}_type_layout_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; + let type_layouts = extract_ptx_kernel_layout(&mut kernel_ptx); + remove_kernel_type_use_from_ptx(&mut kernel_ptx); - let mut type_layouts = Vec::new(); + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Link(&specialisation), + &kernel_hash, + &ptx_lint_levels, + ); - let type_layout_start_pattern = format!("\n\t// .globl\t{kernel_layout_name}"); + (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +} - if let Some(type_layout_start) = kernel_ptx.find(&type_layout_start_pattern) { - const BEFORE_PARAM_PATTERN: &str = ".global .align 1 .b8 "; - const PARAM_LEN_PATTERN: &str = "["; - const LEN_BYTES_PATTERN: &str = "] = {"; - const AFTER_BYTES_PATTERN: &str = "};"; +fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { + const BEFORE_PARAM_PATTERN: &str = "global .align 1 .b8 "; + const PARAM_LEN_PATTERN: &str = "["; + const LEN_BYTES_PATTERN: &str = "] = {"; + const AFTER_BYTES_PATTERN: &str = "};"; - let after_type_layout_start = type_layout_start + type_layout_start_pattern.len(); + let mut type_layouts = Vec::new(); + + while let Some(type_layout_start) = kernel_ptx.find(BEFORE_PARAM_PATTERN) { + let param_start = type_layout_start + BEFORE_PARAM_PATTERN.len(); - let Some(type_layout_middle) = kernel_ptx[after_type_layout_start..] 
- .find(&format!(".visible .entry {kernel_layout_name}")) - .map(|i| after_type_layout_start + i) - else { + let Some(len_start_offset) = kernel_ptx[param_start..].find(PARAM_LEN_PATTERN) else { abort_call_site!( - "Kernel compilation generated invalid PTX: incomplete type layout information" + "Kernel compilation generated invalid PTX: missing type layout data" ) }; + let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); - let mut next_type_layout = after_type_layout_start; + let Some(bytes_start_offset) = kernel_ptx[len_start..].find(LEN_BYTES_PATTERN) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: missing type layout length" + ) + }; + let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); - while let Some(param_start_offset) = - kernel_ptx[next_type_layout..type_layout_middle].find(BEFORE_PARAM_PATTERN) - { - let param_start = next_type_layout + param_start_offset + BEFORE_PARAM_PATTERN.len(); + let Some(bytes_end_offset) = kernel_ptx[bytes_start..].find(AFTER_BYTES_PATTERN) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout data" + ) + }; + let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; + let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; + let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - if let Some(len_start_offset) = - kernel_ptx[param_start..type_layout_middle].find(PARAM_LEN_PATTERN) - { - let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); + let param = quote::format_ident!("{}", param); - if let Some(bytes_start_offset) = - kernel_ptx[len_start..type_layout_middle].find(LEN_BYTES_PATTERN) - { - let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); + let Ok(len) = len.parse::() else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout length" + ) + }; + let Ok(bytes) = bytes.split(", 
").map(std::str::FromStr::from_str).collect::, _>>() else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout byte" + ) + }; - if let Some(bytes_end_offset) = - kernel_ptx[bytes_start..type_layout_middle].find(AFTER_BYTES_PATTERN) - { - let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; - let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; - let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - - let param = quote::format_ident!("{}", param); - - let Ok(len) = len.parse::() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - length" - ) - }; - let Ok(bytes) = bytes - .split(", ") - .map(std::str::FromStr::from_str) - .collect::, _>>() - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - byte" - ) - }; - - if bytes.len() != len { - abort_call_site!( - "Kernel compilation generated invalid PTX: type layout length \ - mismatch" - ); - } - - let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); - - type_layouts.push(quote! { - const #param: &[u8; #len] = #byte_str; - }); - - next_type_layout = - bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); - } else { - next_type_layout = bytes_start; - } - } else { - next_type_layout = len_start; - } - } else { - next_type_layout = param_start; - } + if bytes.len() != len { + abort_call_site!( + "Kernel compilation generated invalid PTX: type layout length mismatch" + ); } - let Some(type_layout_end) = kernel_ptx[type_layout_middle..] - .find('}') - .map(|i| type_layout_middle + i + '}'.len_utf8()) - else { - abort_call_site!("Kernel compilation generated invalid PTX") - }; + let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); + + type_layouts.push(quote! 
{ + const #param: &[u8; #len] = #byte_str; + }); + + let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - check_kernel_ptx_and_report( - &kernel_ptx, - Specialisation::Link(&specialisation), - &kernel_hash, - &ptx_lint_levels, - ); + type_layouts +} - (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) { + while let Some(kernel_type_layout_start) = kernel_ptx.find(KERNEL_TYPE_USE_START_CANARY) { + let kernel_type_layout_start = kernel_ptx[..kernel_type_layout_start] + .rfind('\n') + .unwrap_or(kernel_type_layout_start); + + let Some(kernel_type_layout_end_offset) = kernel_ptx[ + kernel_type_layout_start.. + ].find(KERNEL_TYPE_USE_END_CANARY) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: incomplete type layout use section" + ); + }; + + let kernel_type_layout_end_offset = kernel_type_layout_end_offset + + kernel_ptx[kernel_type_layout_start + kernel_type_layout_end_offset..] 
+ .find('\n') + .unwrap_or(KERNEL_TYPE_USE_END_CANARY.len()); + + let kernel_type_layout_end = kernel_type_layout_start + kernel_type_layout_end_offset; + + kernel_ptx.replace_range(kernel_type_layout_start..kernel_type_layout_end, ""); + } } #[allow(clippy::too_many_lines)] diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs index 6dff13380..9e3a80789 100644 --- a/rust-cuda-derive/src/kernel/mod.rs +++ b/rust-cuda-derive/src/kernel/mod.rs @@ -4,3 +4,6 @@ pub mod wrapper; mod lints; mod utils; + +const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; +const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 29473858e..04e396d70 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -2,7 +2,10 @@ use proc_macro2::TokenStream; use quote::quote_spanned; use syn::spanned::Spanned; -use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::{ + super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, + FuncIdent, FunctionInputs, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( @@ -96,29 +99,27 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); - let func_type_layout_ident = quote::format_ident!("{}_type_layout", func_ident); - quote! 
{ #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + unsafe { + ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); + } #( #[no_mangle] static #func_layout_params: [ u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); - *#func_params = &#func_layout_params; + unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* - } + unsafe { + ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); + } - #[cfg(target_os = "cuda")] - #[#crate_path::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { use super::#args; diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs index 533021b90..c647a65eb 100644 --- a/rust-cuda-ptx-jit/src/device.rs +++ b/rust-cuda-ptx-jit/src/device.rs @@ -5,7 +5,7 @@ macro_rules! 
PtxJITConstLoad { ([$index:literal] => $reference:expr) => { unsafe { ::core::arch::asm!( - concat!("// //"), + ::core::concat!("// //"), in(reg32) *($reference as *const _ as *const u32), ) } From d28f237af93f28e0d0b2a7a3a8d6c0421925a13b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 12:42:21 +0000 Subject: [PATCH 037/120] Fix up rebase issues --- rust-cuda-derive/src/kernel/link/mod.rs | 36 +++++++++++-------------- rust-cuda-derive/src/kernel/lints.rs | 19 ++++++++++--- rust-toolchain | 2 +- src/utils/device_copy.rs | 4 +-- src/utils/exchange/buffer/device.rs | 4 +-- src/utils/exchange/buffer/host.rs | 4 +-- 6 files changed, 36 insertions(+), 33 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 8df29f33a..b03c9f756 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -54,7 +54,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { - return quote!(::core::result::Result::Err(())).into() + return quote!(::core::result::Result::Err(())).into(); }; check_kernel_ptx_and_report( @@ -126,7 +126,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { - const BEFORE_PARAM_PATTERN: &str = "global .align 1 .b8 "; + const BEFORE_PARAM_PATTERN: &str = ".visible .global .align 1 .b8 "; const PARAM_LEN_PATTERN: &str = "["; const LEN_BYTES_PATTERN: &str = "] = {"; const AFTER_BYTES_PATTERN: &str = "};"; @@ -137,23 +137,17 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec Vec() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout length" - ) + abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout length") }; - let Ok(bytes) = bytes.split(", 
").map(std::str::FromStr::from_str).collect::, _>>() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout byte" - ) + let Ok(bytes) = bytes + .split(", ") + .map(std::str::FromStr::from_str) + .collect::, _>>() + else { + abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout byte") }; if bytes.len() != len { @@ -198,9 +192,9 @@ fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) { .rfind('\n') .unwrap_or(kernel_type_layout_start); - let Some(kernel_type_layout_end_offset) = kernel_ptx[ - kernel_type_layout_start.. - ].find(KERNEL_TYPE_USE_END_CANARY) else { + let Some(kernel_type_layout_end_offset) = + kernel_ptx[kernel_type_layout_start..].find(KERNEL_TYPE_USE_END_CANARY) + else { abort_call_site!( "Kernel compilation generated invalid PTX: incomplete type layout use section" ); diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index e91222dcd..6da06ed4b 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -2,6 +2,7 @@ use std::{collections::HashMap, fmt}; use syn::spanned::Spanned; +#[allow(clippy::too_many_lines)] pub fn parse_ptx_lint_level( path: &syn::Path, nested: &syn::punctuated::Punctuated, @@ -46,10 +47,15 @@ pub fn parse_ptx_lint_level( continue; } - let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + let Some(syn::PathSegment { + ident: namespace, + arguments: syn::PathArguments::None, + }) = path.segments.first() + else { emit_error!( meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", level, ); continue; @@ -65,10 +71,15 @@ pub fn parse_ptx_lint_level( continue; } - let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + let 
Some(syn::PathSegment { + ident: lint, + arguments: syn::PathArguments::None, + }) = path.segments.last() + else { emit_error!( meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", level, ); continue; diff --git a/rust-toolchain b/rust-toolchain index 512b40786..d6e655e5f 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,5 @@ [toolchain] -# Pin version pin until const traits are back +# Pin to final 1.75.0 nightly channel = "nightly-2023-11-10" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 0c77a8d1a..2869cd296 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -102,9 +102,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } -unsafe impl RustToCudaAsync - for SafeDeviceCopyWrapper -{ +unsafe impl RustToCudaAsync for SafeDeviceCopyWrapper { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 09ffa2b43..f6f00248b 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -48,7 +48,7 @@ unsafe impl - RustToCudaAsync for CudaExchangeBufferDevice +unsafe impl RustToCudaAsync + for CudaExchangeBufferDevice { } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 384f290bb..24a95bfe3 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -160,8 +160,8 @@ unsafe impl - RustToCudaAsync for CudaExchangeBufferHost +unsafe impl RustToCudaAsync + for CudaExchangeBufferHost { #[allow(clippy::type_complexity)] unsafe fn borrow_async( From e9bb611f60fe2222b2bb3be247aa44d6a7cd6a66 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 12:47:23 
+0000 Subject: [PATCH 038/120] Install CUDA in all CI steps --- .github/workflows/ci.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e66a8ed9..954395a77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,16 @@ jobs: rust: [nightly] steps: + - name: Install CUDA + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" + sudo apt-get update -q + sudo apt-get install cuda -y --no-install-recommends + - name: Checkout the Repository uses: actions/checkout@v2 @@ -129,6 +139,16 @@ jobs: rust: [nightly] steps: + - name: Install CUDA + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" + sudo apt-get update -q + sudo apt-get install cuda -y --no-install-recommends + - name: Checkout the Repository uses: actions/checkout@v2 From 1493d97b16f9fd04a723c4cf96e85f7d7e1c6612 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 14:27:10 +0000 Subject: [PATCH 039/120] Use CStr literals --- .github/workflows/rustdoc.yml | 1 + rust-cuda-derive/src/kernel/link/mod.rs | 24 +++++++++++------------- rust-cuda-derive/src/lib.rs | 1 + 3 files changed, 13 insertions(+), 13 
deletions(-) diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index 285fc57c2..ec466acf0 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -31,6 +31,7 @@ jobs: --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ -Zunstable-options \ " cargo doc \ --all-features \ diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index b03c9f756..ae0b5ea63 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -383,11 +383,9 @@ fn check_kernel_ptx( seahash::hash(specialisation.as_bytes()) ), }; + let kernel_name = CString::new(kernel_name).unwrap(); - let mut options = vec![ - CString::new("--entry").unwrap(), - CString::new(kernel_name).unwrap(), - ]; + let mut options = vec![c"--entry", kernel_name.as_c_str()]; if ptx_lint_levels .values() @@ -399,27 +397,27 @@ fn check_kernel_ptx( .get(&PtxLint::Verbose) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--verbose").unwrap()); + options.push(c"--verbose"); } if ptx_lint_levels .get(&PtxLint::DoublePrecisionUse) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-double-precision-use").unwrap()); + options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels .get(&PtxLint::LocalMemoryUsage) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + options.push(c"--warn-on-local-memory-usage"); } if ptx_lint_levels .get(&PtxLint::RegisterSpills) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-spills").unwrap()); + 
options.push(c"--warn-on-spills"); } - options.push(CString::new("--warning-as-error").unwrap()); + options.push(c"--warning-as-error"); let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); @@ -436,25 +434,25 @@ fn check_kernel_ptx( .get(&PtxLint::Verbose) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--verbose").unwrap()); + options.push(c"--verbose"); } if ptx_lint_levels .get(&PtxLint::DoublePrecisionUse) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-double-precision-use").unwrap()); + options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels .get(&PtxLint::LocalMemoryUsage) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + options.push(c"--warn-on-local-memory-usage"); } if ptx_lint_levels .get(&PtxLint::RegisterSpills) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-spills").unwrap()); + options.push(c"--warn-on-spills"); } let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 572e1c9da..e94048081 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,7 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] +#![feature(c_str_literals)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From e09e884cd934b97d2a0663052c14e8aa62b54995 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:37:20 +0000 Subject: [PATCH 040/120] Simplify and document the safety traits --- .github/workflows/rustdoc.yml | 3 +- .vscode/settings.json | 2 +- Cargo.toml | 5 - examples/single-source/src/main.rs | 2 +- .../cpu_wrapper/kernel_func_async/mod.rs | 4 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 8 +- src/lib.rs | 3 +- src/safety/device_copy.rs | 47 ++++--- 
src/safety/mod.rs | 6 +- src/safety/no_aliasing.rs | 105 +++++++++++---- src/safety/stack_only.rs | 123 ++++++++++++------ src/safety/unified_heap.rs | 53 -------- src/utils/alloc.rs | 67 ---------- src/utils/mod.rs | 3 - src/utils/shared/slice.rs | 40 +++--- src/utils/shared/static.rs | 53 ++++---- 16 files changed, 238 insertions(+), 286 deletions(-) delete mode 100644 src/safety/unified_heap.rs delete mode 100644 src/utils/alloc.rs diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index ec466acf0..23f4f1c07 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -28,10 +28,11 @@ jobs: run: | RUSTDOCFLAGS="\ --enable-index-page \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ + --extern-html-root-url final=https://docs.rs/final/0.1.1/ \ --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \ - --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ -Zunstable-options \ " cargo doc \ --all-features \ diff --git a/.vscode/settings.json b/.vscode/settings.json index 93f713cad..b033ed643 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,5 @@ "rust-analyzer.updates.askBeforeDownload": false, "rust-analyzer.checkOnSave.command": "reap-clippy", "rust-analyzer.cargo.allFeatures": false, - "rust-analyzer.cargo.features": ["alloc", "derive", "host"], + "rust-analyzer.cargo.features": ["derive", "host"], } diff --git a/Cargo.toml b/Cargo.toml index 2ebfbe32e..a2076ca1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,6 @@ rust-version = "1.75" # nightly [features] default = [] -alloc = ["hashbrown"] host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] @@ -32,10 +31,6 @@ rustacuda_derive = { git = 
"https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc const-type-layout = { version = "0.2.0", features = ["derive"] } final = "0.1.1" -hashbrown = { version = "0.14", default-features = false, features = ["inline-more"], optional = true } rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" } - -[dev-dependencies] -hashbrown = { version = "0.14", default-features = false, features = ["inline-more"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index ccd384676..b80a14201 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -59,7 +59,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where - T: rc::safety::StackOnly + rc::safety::NoAliasing, + T: rc::safety::StackOnly + rc::safety::NoSafeAliasing, ::CudaRepresentation: rc::safety::StackOnly, ::CudaAllocation: rc::common::EmptyCudaAlloc, { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 6e123440e..747f4a278 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -100,10 +100,10 @@ pub(super) fn quote_kernel_func_async( fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_safe_aliasing() {} #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* + #(assert_impl_no_safe_aliasing::<#cpu_func_unboxed_types>();)* } let #crate_path::host::LaunchConfig { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 
04e396d70..042ae5e7a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -64,9 +64,9 @@ pub(in super::super) fn quote_cuda_wrapper( syn::TypeReference { and_token, .. } ) = &**ty { // DeviceCopy mode only supports immutable references - quote! { #ptx_jit_load; { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } + quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } } else { - quote! { { let #pat: #syn_type = #pat.into_inner(); #inner } } + quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } }, InputCudaType::LendRustToCuda => if let syn::Type::Reference( syn::TypeReference { and_token, mutability, ..} @@ -135,10 +135,10 @@ pub(in super::super) fn quote_cuda_wrapper( fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_safe_aliasing() {} #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* + #(assert_impl_no_safe_aliasing::<#ptx_func_unboxed_types>();)* } #ptx_func_input_unwrap diff --git a/src/lib.rs b/src/lib.rs index de590c29b..273e27779 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,14 +16,13 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(asm_const) )] -#![cfg_attr(target_os = "cuda", feature(ptr_metadata))] -#![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] #![feature(offset_of)] #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] +#![feature(ptr_metadata)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index ee1cef0dc..9aedc8e81 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -1,30 +1,29 @@ 
-#[allow(clippy::module_name_repetitions)] -pub trait SafeDeviceCopy: sealed::SafeDeviceCopy {} +use const_type_layout::TypeGraphLayout; -impl SafeDeviceCopy for T {} +use crate::{common::DeviceAccessible, safety::StackOnly}; -mod sealed { - #[marker] - pub trait SafeDeviceCopy {} +#[allow(clippy::module_name_repetitions)] +/// Types which are safe to memcpy from the CPU to a GPU. +/// +/// For a type to implement [`SafeDeviceCopy`], it must +/// +/// * have the same memory layout on both the CPU and GPU +/// +/// * not contain any references to data that is inaccessible from the GPU +/// +/// Types that implement both [`TypeGraphLayout`] and [`StackOnly`] satisfy +/// both of these criteria and thus implement [`SafeDeviceCopy`]. +#[marker] +pub trait SafeDeviceCopy: sealed::Sealed {} - // Thread-block-shared data cannot be copied since information is added inside - // CUDA - impl !SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} - impl !SafeDeviceCopy - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } +impl SafeDeviceCopy for T {} +impl sealed::Sealed for T {} - impl SafeDeviceCopy for T {} - #[cfg(any(feature = "alloc", doc))] - impl SafeDeviceCopy for T {} +#[doc(hidden)] +impl SafeDeviceCopy for DeviceAccessible {} +impl sealed::Sealed for DeviceAccessible {} - impl SafeDeviceCopy - for crate::common::DeviceAccessible - { - } - impl SafeDeviceCopy - for crate::utils::device_copy::SafeDeviceCopyWrapper - { - } +mod sealed { + #[marker] + pub trait Sealed {} } diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 7e5c419a5..72ed9c7db 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -2,8 +2,6 @@ mod arch; mod device_copy; mod no_aliasing; mod stack_only; -#[cfg(any(feature = "alloc", doc))] -mod unified_heap; #[doc(hidden)] pub mod kernel_signature; @@ -11,7 +9,5 @@ pub mod kernel_signature; pub mod type_layout; pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoAliasing; +pub use 
no_aliasing::NoSafeAliasing; pub use stack_only::StackOnly; -#[cfg(any(feature = "alloc", doc))] -pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 98a180b6a..0fc3abf9c 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -1,33 +1,86 @@ #[allow(clippy::module_name_repetitions)] -pub trait NoAliasing: private::NoAliasing {} -impl NoAliasing for T {} +/// Types which can be safely shared between CUDA threads because they do +/// not provide safe aliasing mutable access to some shared inner state. +/// +/// This trait is automatically implemented when the compiler determines +/// it's appropriate. +/// +/// Data types that contain no references and can thus live entirely on +/// the stack, e.g. primitive types like [`u8`] and structs, tuples, and +/// enums made only from them, or more generally those types that implement +/// [`StackOnly`](super::StackOnly), also implement [`NoSafeAliasing`] as they +/// do not contain any inner data that might be shared when each thread is +/// given mutable access to a copy. +/// +/// In contrast, `&mut T` (and any type containing a mutable reference) do *not* +/// implement [`NoSafeAliasing`] as several threads would obtain mutable +/// aliasing access to the same date, thus violating Rust's borrowing and +/// memory safety rules. +/// +/// Even though `*const T` and `*mut T` do not provide *safe* mutable aliasing +/// access to their underlying data, as dereferincing them is always unsafe, +/// they (and any type containing a pointer) do *not* implement +/// [`NoSafeAliasing`] to ensure that any data type that uses them to build a +/// safe interface to accessing data, e.g. [`Box`], does not accidentially +/// implement [`NoSafeAliasing`]. 
If you have implemented a data structure that +/// uses `*const T` or `*mut T` internally but also ensures that no safe +/// aliasing mutable access is provided, you can *unsafely* implement +/// [`NoSafeAliasing`] for your type. Please reference the [Safety](#safety) +/// section below for more details on the contract you must uphold in this case. +/// +/// # Safety +/// +/// This trait must only be manually implemented for a type that upholds +/// the no-mutable-aliasing guarantee through its safe API. +/// +/// The following examples outline three different cases for types that do +/// fulfil this safety requirement: +/// +/// * [`Final`](final::Final) implements [`NoSafeAliasing`] +/// because even a mutable reference to it only provides read-only access +/// to its inner data. +/// +/// * [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) +/// and +/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) +/// also implement [`NoSafeAliasing`] because they only provide each CUDA thread +/// with mutable access to its own partition of a slice and thus avoid mutable +/// aliasing. +/// +/// * [`ThreadBlockShared`](crate::utils::shared::static::ThreadBlockShared) +/// and +/// [`ThreadBlockSharedSlice`](crate::utils::shared::slice::ThreadBlockSharedSlice) +/// also implement [`NoSafeAliasing`] since they only provide access to `*mut +/// T`, which is always unsafe to mutate and thus moves the burden to uphoald +/// the no-mutable-aliasing safety invariant to the user who derefereces these +/// pointers. 
+pub unsafe auto trait NoSafeAliasing {} -mod private { - pub auto trait NoAliasing {} +impl !NoSafeAliasing for &mut T {} +impl !NoSafeAliasing for *const T {} +impl !NoSafeAliasing for *mut T {} - impl !NoAliasing for *const T {} - impl !NoAliasing for *mut T {} - impl !NoAliasing for &mut T {} +unsafe impl NoSafeAliasing for core::marker::PhantomData {} - impl NoAliasing for core::marker::PhantomData {} - - impl NoAliasing for r#final::Final {} - impl NoAliasing - for crate::utils::aliasing::FinalCudaRepresentation - { - } +unsafe impl NoSafeAliasing for r#final::Final {} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::FinalCudaRepresentation +{ +} - impl NoAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride - { - } - impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride +{ +} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride +{ +} - // Thread-block-shared data only allows unsafe aliasing since only raw pointers - // are exposed - impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} - impl NoAliasing - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } +// Thread-block-shared data only allows unsafe aliasing since only raw pointers +// are exposed +unsafe impl NoSafeAliasing for crate::utils::shared::r#static::ThreadBlockShared {} +unsafe impl NoSafeAliasing + for crate::utils::shared::slice::ThreadBlockSharedSlice +{ } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 5dc5c0cbb..bfb4e80d0 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -1,47 +1,86 @@ -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// fn assert_stackonly(_x: impl StackOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// 
assert_stackonly(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait StackOnly: sealed::StackOnly {} -impl StackOnly for T {} +macro_rules! stack_only_docs { + ($item:item) => { + /// Types which contain no pointers or references and can thus live entirely + /// on the stack. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Primitive types like [`u8`] and structs, tuples, and enums made only + /// from them implement [`StackOnly`]. + /// + /// In contrast, `&T`, `&mut T`, `*const T`, `*mut T`, and any type + /// containing a reference or a pointer do *not* implement [`StackOnly`]. 
+ /// + /// # Examples + /// + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// fn assert_stackonly(_x: impl StackOnly) {} + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(42); // ok + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly([42; 42]); // ok + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(vec![42]); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(&42); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::r#static::ThreadBlockShared; + /// assert_stackonly(ThreadBlockShared::new_uninit()); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::slice::ThreadBlockSharedSlice; + /// assert_stackonly(ThreadBlockSharedSlice::new_uninit_with_len(0)); // error + /// ``` + $item + }; +} -mod sealed { - pub auto trait StackOnly {} +#[cfg(not(doc))] +stack_only_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait StackOnly: sealed::Sealed {} +} +#[cfg(doc)] +stack_only_docs! 
{ + pub use sealed::Sealed as StackOnly; +} + +#[cfg(not(doc))] +impl StackOnly for T {} - impl !StackOnly for *const T {} - impl !StackOnly for *mut T {} - impl !StackOnly for &T {} - impl !StackOnly for &mut T {} +mod sealed { + pub auto trait Sealed {} - // Thread-block-shared data contains data not on the stack - impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !StackOnly - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } + impl !Sealed for &T {} + impl !Sealed for &mut T {} + impl !Sealed for *const T {} + impl !Sealed for *mut T {} - impl StackOnly for core::marker::PhantomData {} + impl Sealed for core::marker::PhantomData {} } diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs deleted file mode 100644 index 483b40c3a..000000000 --- a/src/safety/unified_heap.rs +++ /dev/null @@ -1,53 +0,0 @@ -#[doc(cfg(feature = "alloc"))] -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait UnifiedHeapOnly: sealed::UnifiedHeapOnly {} -impl UnifiedHeapOnly for T {} - -mod sealed { - use crate::utils::alloc::UnifiedAllocator; - - pub auto trait UnifiedHeapOnly {} 
- - impl !UnifiedHeapOnly for *const T {} - impl !UnifiedHeapOnly for *mut T {} - impl !UnifiedHeapOnly for &T {} - impl !UnifiedHeapOnly for &mut T {} - - // Thread-block-shared data contains CUDA-only data - impl !UnifiedHeapOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !UnifiedHeapOnly - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } - - impl UnifiedHeapOnly for core::marker::PhantomData {} - - impl UnifiedHeapOnly for alloc::boxed::Box {} - impl UnifiedHeapOnly for alloc::vec::Vec {} - impl UnifiedHeapOnly for hashbrown::HashMap {} -} diff --git a/src/utils/alloc.rs b/src/utils/alloc.rs deleted file mode 100644 index 3bbcf225b..000000000 --- a/src/utils/alloc.rs +++ /dev/null @@ -1,67 +0,0 @@ -use alloc::alloc::{AllocError, Allocator, Layout}; -use core::ptr::NonNull; - -#[allow(clippy::module_name_repetitions)] -pub struct UnifiedAllocator; - -unsafe impl Allocator for UnifiedAllocator { - #[cfg(feature = "host")] - fn allocate(&self, layout: Layout) -> Result, AllocError> { - if layout.size() == 0 { - return Ok(NonNull::<[u8; 0]>::dangling()); - } - - match layout.align() { - 1 => alloc_unified_aligned::(layout.size()), - 2 => alloc_unified_aligned::(layout.size() >> 1), - 4 => alloc_unified_aligned::(layout.size() >> 2), - 8 => alloc_unified_aligned::(layout.size() >> 3), - _ => Err(AllocError), - } - } - - #[cfg(not(feature = "host"))] - fn allocate(&self, _layout: Layout) -> Result, AllocError> { - Err(AllocError) - } - - #[cfg(feature = "host")] - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { - use rustacuda::{ - error::CudaResult, - memory::{cuda_free_unified, UnifiedPointer}, - }; - - if layout.size() == 0 { - return; - } - - let _: CudaResult<()> = cuda_free_unified(UnifiedPointer::wrap(ptr.as_ptr())); - } - - #[cfg(not(feature = "host"))] - unsafe fn deallocate(&self, _ptr: NonNull, _layout: Layout) { - // no-op - } -} - -#[cfg(feature = "host")] -fn alloc_unified_aligned( - size: usize, -) -> 
Result, AllocError> { - use rustacuda::memory::cuda_malloc_unified; - - match unsafe { cuda_malloc_unified::(size) } { - Ok(mut ptr) => { - let bytes: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - ptr.as_raw_mut().cast(), - size * core::mem::align_of::(), - ) - }; - - NonNull::new(bytes).ok_or(AllocError) - }, - Err(_) => Err(AllocError), - } -} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index c70432f31..dadf5a443 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,7 +1,4 @@ pub mod aliasing; -#[cfg(any(feature = "alloc", doc))] -#[doc(cfg(feature = "alloc"))] -pub mod alloc; pub mod device_copy; pub mod exchange; pub mod shared; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 0a8a66c62..804623ae4 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,20 +1,12 @@ -#[cfg(not(target_os = "cuda"))] -use core::marker::PhantomData; - use const_type_layout::TypeGraphLayout; -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::module_name_repetitions)] -#[repr(transparent)] -pub struct ThreadBlockSharedSlice { - len: usize, - marker: PhantomData, -} - -#[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] pub struct ThreadBlockSharedSlice { + #[cfg(not(target_os = "cuda"))] + // dangling marker s.t. 
Self is not StackOnly + dangling: *mut [T], + #[cfg(target_os = "cuda")] shared: *mut [T], } @@ -24,8 +16,7 @@ impl ThreadBlockSharedSlice { #[must_use] pub fn new_uninit_with_len(len: usize) -> Self { Self { - len, - marker: PhantomData::, + dangling: Self::dangling_slice_with_len(len), } } @@ -33,7 +24,7 @@ impl ThreadBlockSharedSlice { #[doc(cfg(not(target_os = "cuda")))] #[must_use] pub fn with_len(mut self, len: usize) -> Self { - self.len = len; + self.dangling = Self::dangling_slice_with_len(len); self } @@ -41,20 +32,27 @@ impl ThreadBlockSharedSlice { #[doc(cfg(not(target_os = "cuda")))] #[must_use] pub fn with_len_mut(&mut self, len: usize) -> &mut Self { - self.len = len; + self.dangling = Self::dangling_slice_with_len(len); self } #[cfg(not(target_os = "cuda"))] - #[must_use] - pub fn len(&self) -> usize { - self.len + fn dangling_slice_with_len(len: usize) -> *mut [T] { + core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len) } - #[cfg(target_os = "cuda")] #[must_use] pub fn len(&self) -> usize { - core::ptr::metadata(self.shared) + core::ptr::metadata({ + #[cfg(not(target_os = "cuda"))] + { + self.dangling + } + #[cfg(target_os = "cuda")] + { + self.shared + } + }) } #[must_use] diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 5b8cdfc52..0ba7f9df0 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,43 +1,38 @@ -#[cfg(not(target_os = "cuda"))] -use core::marker::PhantomData; - -#[cfg(not(target_os = "cuda"))] -#[repr(transparent)] -pub struct ThreadBlockShared { - marker: PhantomData, -} - -#[cfg(target_os = "cuda")] #[repr(transparent)] pub struct ThreadBlockShared { + #[cfg(not(target_os = "cuda"))] + // dangling marker s.t. 
Self is not StackOnly + _dangling: *mut T, + #[cfg(target_os = "cuda")] shared: *mut T, } impl ThreadBlockShared { - #[cfg(not(target_os = "cuda"))] #[must_use] pub fn new_uninit() -> Self { - Self { - marker: PhantomData::, + #[cfg(not(target_os = "cuda"))] + { + Self { + _dangling: core::ptr::NonNull::dangling().as_ptr(), + } } - } - #[cfg(target_os = "cuda")] - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); + #[cfg(target_os = "cuda")] + { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } } - - Self { shared } } #[cfg(any(target_os = "cuda", doc))] From 4baa5dc7b49ad1d9e1830da981e866945df1dc59 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:42:12 +0000 Subject: [PATCH 041/120] Fix move_to_cuda bound --- src/device/mod.rs | 4 ++-- src/host.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/mod.rs b/src/device/mod.rs index 45c833923..699424355 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -51,7 +51,7 @@ pub trait BorrowFromRust: RustToCuda { inner: F, ) -> O where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy; } @@ -86,7 +86,7 @@ impl BorrowFromRust for T { inner: F, ) -> O where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, { inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) diff --git a/src/host.rs b/src/host.rs index aed9aaa83..9709798d3 100644 --- 
a/src/host.rs +++ b/src/host.rs @@ -180,7 +180,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc; } @@ -238,7 +238,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc, { From 720d14a48fe1b01a2a6ff1a848c54d4215b7f766 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:58:43 +0000 Subject: [PATCH 042/120] Fix clippy for 1.76 --- rust-cuda-derive/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index e94048081..74e76a2cc 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,7 +5,8 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] -#![feature(c_str_literals)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From 942c5f9a748e33c41fefe1cb8b22451b91de3334 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 14 Dec 2023 13:51:17 +0000 Subject: [PATCH 043/120] Cleaned up the rust-cuda device macros with better print The implementation still uses String for dynamic formatting, which currently pulls in loads of formatting and panic machinery. While a custom String type that pre-allocated the exact format String length can avoid some of that, the formatting machinery even for e.g. usize is still large. If `format_args!` is ever optimised for better inlining, the more verbose and lower-level implementation could be reconsidered. 
--- .vscode/settings.json | 6 +- Cargo.toml | 2 +- examples/derive/Cargo.toml | 3 +- examples/print/.cargo/config.toml | 5 + examples/print/Cargo.toml | 14 +++ examples/print/src/main.rs | 69 +++++++++++ examples/single-source/.cargo/config.toml | 2 +- examples/single-source/Cargo.toml | 5 +- examples/single-source/src/main.rs | 7 +- rust-cuda-derive/Cargo.toml | 2 +- rust-cuda-derive/src/kernel/link/mod.rs | 12 ++ .../src/kernel/link/ptx_compiler_sys.rs | 10 +- rust-cuda-derive/src/kernel/lints.rs | 3 + .../generate/cpu_linker_macro/get_ptx_str.rs | 1 + .../kernel/wrapper/generate/cuda_wrapper.rs | 1 + rust-cuda-derive/src/kernel/wrapper/mod.rs | 1 + rust-cuda-ptx-jit/Cargo.toml | 2 +- src/device/macros.rs | 115 ------------------ src/device/mod.rs | 3 +- src/device/utils.rs | 28 +++++ 20 files changed, 153 insertions(+), 138 deletions(-) create mode 100644 examples/print/.cargo/config.toml create mode 100644 examples/print/Cargo.toml create mode 100644 examples/print/src/main.rs delete mode 100644 src/device/macros.rs create mode 100644 src/device/utils.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index b033ed643..c2b4219f5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,9 @@ "rust-analyzer.updates.askBeforeDownload": false, "rust-analyzer.checkOnSave.command": "reap-clippy", "rust-analyzer.cargo.allFeatures": false, - "rust-analyzer.cargo.features": ["derive", "host"], + "rust-analyzer.cargo.features": [ + "derive", + "host" + ], + "rust-analyzer.showUnlinkedFileNotification": false, } diff --git a/Cargo.toml b/Cargo.toml index a2076ca1c..9e9a568f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] members = [ ".", "rust-cuda-derive", "rust-cuda-ptx-jit", - "examples/single-source", "examples/derive", + "examples/derive", "examples/print", "examples/single-source", ] default-members = [ ".", "rust-cuda-derive", "rust-cuda-ptx-jit" diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml 
index f4ea53d90..1b000fe8c 100644 --- a/examples/derive/Cargo.toml +++ b/examples/derive/Cargo.toml @@ -1,12 +1,11 @@ [package] name = "derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -const-type-layout = { version = "0.2.0" } rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml new file mode 100644 index 000000000..f7029e166 --- /dev/null +++ b/examples/print/.cargo/config.toml @@ -0,0 +1,5 @@ +[target.nvptx64-nvidia-cuda] +rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] + +[unstable] +features = ["all"] diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml new file mode 100644 index 000000000..21f513d8f --- /dev/null +++ b/examples/print/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "print" +version = "0.1.0" +authors = ["Juniper Tyree "] +license = "MIT OR Apache-2.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[target.'cfg(target_os = "cuda")'.dependencies] +rust-cuda = { path = "../../", features = ["derive"] } + +[target.'cfg(not(target_os = "cuda"))'.dependencies] +rust-cuda = { path = "../../", features = ["derive", "host"] } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs new file mode 100644 index 000000000..49492e0a4 --- /dev/null +++ b/examples/print/src/main.rs @@ -0,0 +1,69 @@ +#![deny(clippy::pedantic)] +#![cfg_attr(target_os = "cuda", no_std)] +#![cfg_attr(target_os = "cuda", no_main)] +#![cfg_attr(target_os = "cuda", feature(abi_ptx))] +#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] +#![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] 
+#![cfg_attr(target_os = "cuda", feature(core_panic))] + +extern crate alloc; + +#[cfg(not(target_os = "cuda"))] +fn main() {} + +#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[kernel(deny( + ptx::double_precision_use, + ptx::local_memory_usage, + ptx::register_spills, + ptx::dynamic_stack_size +))] +pub fn kernel() { + rust_cuda::device::utils::print(format_args!("println! from CUDA kernel")); +} + +#[cfg(not(target_os = "cuda"))] +mod host { + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; + + #[allow(dead_code)] + struct Launcher; + + link_kernel!(); + + impl rust_cuda::host::Launcher for Launcher { + type CompilationWatcher = (); + type KernelTraitObject = dyn Kernel; + + fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { + unimplemented!() + } + } +} + +#[cfg(target_os = "cuda")] +mod cuda_prelude { + use rust_cuda::device::alloc::PTXAllocator; + + #[global_allocator] + static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator; + + #[panic_handler] + fn panic(info: &::core::panic::PanicInfo) -> ! { + rust_cuda::device::utils::print(format_args!("{info}\n")); + + rust_cuda::device::utils::abort() + } + + #[alloc_error_handler] + fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ + let (size, align) = (layout.size(), layout.align()); + + ::core::panicking::panic_nounwind_fmt( + format_args!("memory allocation of {size} bytes with alignment {align} failed\n"), + true, + ) + } +} diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml index 48db9d693..f7029e166 100644 --- a/examples/single-source/.cargo/config.toml +++ b/examples/single-source/.cargo/config.toml @@ -1,5 +1,5 @@ [target.nvptx64-nvidia-cuda] -rustflags = ["-Clink-args=--arch sm_35", "-Clink-arg=-O3", "-Clink-arg=--lto"] +rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] [unstable] features = ["all"] diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 351d694a0..eeada181d 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -1,15 +1,12 @@ [package] name = "single-source" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -[dependencies] -const-type-layout = { version = "0.2.0" } - [target.'cfg(target_os = "cuda")'.dependencies] rc = { package = "rust-cuda", path = "../../", features = ["derive"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b80a14201..085bd3b8d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -3,7 +3,6 @@ #![cfg_attr(target_os = "cuda", no_main)] #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] -#![cfg_attr(target_os = "cuda", feature(stdsimd))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(offset_of)] @@ -103,8 +102,6 @@ mod host { #[cfg(target_os = "cuda")] mod cuda_prelude { - use 
core::arch::nvptx; - use rc::device::alloc::PTXAllocator; #[global_allocator] @@ -112,11 +109,11 @@ mod cuda_prelude { #[panic_handler] fn panic(_: &::core::panic::PanicInfo) -> ! { - unsafe { nvptx::trap() } + rc::device::utils::abort() } #[alloc_error_handler] fn alloc_error_handler(_: core::alloc::Layout) -> ! { - unsafe { nvptx::trap() } + rc::device::utils::abort() } } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 41ad5a33f..31a686008 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rust-cuda-derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" links = "libnvptxcompiler_static" diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index ae0b5ea63..78a352780 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -417,6 +417,12 @@ fn check_kernel_ptx( { options.push(c"--warn-on-spills"); } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level <= LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } options.push(c"--warning-as-error"); let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); @@ -454,6 +460,12 @@ fn check_kernel_ptx( { options.push(c"--warn-on-spills"); } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level < LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 0ab332dad..fac72cebf 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -4,7 +4,7 @@ use thiserror::Error; pub type size_t = 
::std::os::raw::c_ulonglong; #[repr(C)] -pub struct nvPTXCompiler { +pub struct NvptxCompiler { _private: [u8; 0], } @@ -60,15 +60,15 @@ impl NvptxError { } } -/// [`nvPTXCompilerHandle`] represents a handle to the PTX Compiler. +/// [`NvptxCompilerHandle`] represents a handle to the PTX Compiler. /// -/// To compile a PTX program string, an instance of [`nvPTXCompiler`] +/// To compile a PTX program string, an instance of [`NvptxCompiler`] /// must be created and the handle to it must be obtained using the /// API [`nvPTXCompilerCreate`]. Then the compilation can be done /// using the API [`nvPTXCompilerCompile`]. -pub type NvptxCompilerHandle = *mut nvPTXCompiler; +pub type NvptxCompilerHandle = *mut NvptxCompiler; -/// The [`nvPTXCompiler`] APIs return the [`nvPTXCompileResult`] codes to +/// The [`NvptxCompiler`] APIs return the [`NvptxCompileResult`] codes to /// indicate the call result"] pub type NvptxCompileResult = ::std::os::raw::c_int; diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index 6da06ed4b..6c198b71a 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -91,6 +91,7 @@ pub fn parse_ptx_lint_level( l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, l if l == "register_spills" => PtxLint::RegisterSpills, l if l == "dump_binary" => PtxLint::DumpBinary, + l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, _ => { emit_error!( meta.span(), @@ -153,6 +154,7 @@ pub enum PtxLint { LocalMemoryUsage, RegisterSpills, DumpBinary, + DynamicStackSize, } impl fmt::Display for PtxLint { @@ -163,6 +165,7 @@ impl fmt::Display for PtxLint { Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), Self::RegisterSpills => fmt.write_str("register_spills"), Self::DumpBinary => fmt.write_str("dump_binary"), + Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), } } } diff --git 
a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d62445803..10732a133 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -91,6 +91,7 @@ pub(super) fn quote_get_ptx_str( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { + #[allow(unused_imports)] use super::#args; extern "C" { #( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 042ae5e7a..40d4abfbf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -122,6 +122,7 @@ pub(in super::super) fn quote_cuda_wrapper( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { + #[allow(unused_imports)] use super::#args; extern "C" { #( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a70c38e94..a812f9dd4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -114,6 +114,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn); let ptx_lint_levels = { let (lints, levels): (Vec, Vec) = ptx_lint_levels diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index aa7fa32c6..dc5fe4249 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rust-cuda-ptx-jit" version = "0.1.0" -authors = ["Juniper 
Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" diff --git a/src/device/macros.rs b/src/device/macros.rs deleted file mode 100644 index 932ca75ae..000000000 --- a/src/device/macros.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs -#[macro_export] -#[doc(hidden)] -macro_rules! function { - () => {{ - // Hack to get the name of the enclosing function - fn f() {} - fn type_name_of(_: T) -> &'static str { - core::any::type_name::() - } - let name = type_name_of(f); - - // Remove the `::f` suffix - &name[..name.len() - 3] - }}; -} - -/// Alternative of [`std::print!`](https://doc.rust-lang.org/std/macro.print.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! print { - ($($arg:tt)*) => { - let msg = $crate::alloc::format!($($arg)*); - - #[allow(unused_unsafe)] - unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); - } - } -} - -/// Alternative of [`std::println!`](https://doc.rust-lang.org/std/macro.println.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! println { - () => ($crate::print!("\n")); - ($fmt:expr) => ($crate::print!(concat!($fmt, "\n"))); - ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*)); -} - -/// Assertion in GPU kernel for one expression is true. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert { - ($e:expr) => { - if !$e { - let msg = $crate::alloc::format!( - "\nassertion failed: {}\nexpression: {:?}", - stringify!($e), - $e, - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are equal. 
-#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_eq { - ($a:expr, $b:expr) => { - if $a != $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are not equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_ne { - ($a:expr, $b:expr) => { - if $a == $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 699424355..ca9aab9fd 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -14,8 +14,7 @@ use crate::{ pub mod alloc; pub mod thread; - -mod macros; +pub mod utils; pub trait BorrowFromRust: RustToCuda { /// # Safety diff --git a/src/device/utils.rs b/src/device/utils.rs new file mode 100644 index 000000000..e12f5b83c --- /dev/null +++ b/src/device/utils.rs @@ -0,0 +1,28 @@ +/// Abort the CUDA kernel using the `trap` system call. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn abort() -> ! { + unsafe { ::core::arch::nvptx::trap() } +} + +/// The [`print`](print()) function takes an [`Arguments`](core::fmt::Arguments) +/// struct and formats and prints it to the CUDA kernel's standard output using +/// the `vprintf` system call. +/// +/// The [`Arguments`](core::fmt::Arguments) instance can be created with the +/// [`format_args!`](core::format_args) macro. 
+#[allow(clippy::inline_always)] +#[inline(always)] +pub fn print(args: ::core::fmt::Arguments) { + let msg; // place to store the dynamically expanded format string + let msg = if let Some(msg) = args.as_str() { + msg + } else { + msg = ::alloc::fmt::format(args); + msg.as_str() + }; + + unsafe { + ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); + } +} From 068e4584d7be8818d3ee16c7e8547a16b498728f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 00:06:00 +0000 Subject: [PATCH 044/120] Switch to using more vprintf in embedded CUDA kernel --- examples/print/src/main.rs | 49 ++++++++++++++++++++++++++++---------- src/device/alloc.rs | 4 ++++ src/device/utils.rs | 16 ++++++++++++- src/lib.rs | 3 +++ 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 49492e0a4..4d38e0ede 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -4,7 +4,9 @@ #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] -#![cfg_attr(target_os = "cuda", feature(core_panic))] +#![feature(ptr_from_ref)] +#![feature(stdsimd)] +#![feature(c_str_literals)] extern crate alloc; @@ -12,14 +14,11 @@ extern crate alloc; fn main() {} #[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] -#[kernel(deny( - ptx::double_precision_use, - ptx::local_memory_usage, - ptx::register_spills, - ptx::dynamic_stack_size -))] +#[kernel(allow(ptx::local_memory_usage))] pub fn kernel() { rust_cuda::device::utils::print(format_args!("println! from CUDA kernel")); + + ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); } #[cfg(not(target_os = "cuda"))] @@ -58,12 +57,38 @@ mod cuda_prelude { } #[alloc_error_handler] + #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ - let (size, align) = (layout.size(), layout.align()); + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + } + + let location = ::core::panic::Location::caller(); - ::core::panicking::panic_nounwind_fmt( - format_args!("memory allocation of {size} bytes with alignment {align} failed\n"), - true, - ) + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at %.*s:%lu:%lu\n" + .as_ptr() + .cast(), + #[allow(clippy::cast_possible_truncation)] + ::core::ptr::from_ref(&FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: location.file().len() as u32, + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + }) + .cast(), + ); + } + + rust_cuda::device::utils::abort() } } diff --git a/src/device/alloc.rs b/src/device/alloc.rs index 14a294814..0217fa939 100644 --- a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -6,10 +6,14 @@ use core::arch::nvptx; pub struct PTXAllocator; unsafe impl GlobalAlloc for PTXAllocator { + #[allow(clippy::inline_always)] + #[inline(always)] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { nvptx::malloc(layout.size()).cast() } + #[allow(clippy::inline_always)] + #[inline(always)] unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { nvptx::free(ptr.cast()); } diff --git a/src/device/utils.rs b/src/device/utils.rs index e12f5b83c..bac1c6d3b 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -14,6 +14,12 @@ pub fn abort() -> ! 
{ #[allow(clippy::inline_always)] #[inline(always)] pub fn print(args: ::core::fmt::Arguments) { + #[repr(C)] + struct FormatArgs { + msg_len: u32, + msg_ptr: *const u8, + } + let msg; // place to store the dynamically expanded format string let msg = if let Some(msg) = args.as_str() { msg @@ -23,6 +29,14 @@ pub fn print(args: ::core::fmt::Arguments) { }; unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); + ::core::arch::nvptx::vprintf( + c"%.*s".as_ptr().cast(), + #[allow(clippy::cast_possible_truncation)] + ::core::ptr::from_ref(&FormatArgs { + msg_len: msg.len() as u32, + msg_ptr: msg.as_ptr(), + }) + .cast(), + ); } } diff --git a/src/lib.rs b/src/lib.rs index 273e27779..0316613c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,9 @@ #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] +#![feature(ptr_from_ref)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] From eb1a9b4bf6c20c6778d134951ee0b4d1f42ac1a5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 01:45:07 -0800 Subject: [PATCH 045/120] Make print example fully executable --- examples/print/src/main.rs | 92 +++++++++++++++++++++++++++++++++++--- src/device/utils.rs | 2 +- 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 4d38e0ede..94302a7a9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -11,12 +11,52 @@ extern crate alloc; #[cfg(not(target_os = "cuda"))] -fn main() {} +fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { + // Initialize the CUDA API + rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + + // Get the first device + let device = rust_cuda::rustacuda::device::Device::get_device(0)?; + + // Create a context 
associated to this device + let context = rust_cuda::host::CudaDropWrapper::from( + rust_cuda::rustacuda::context::Context::create_and_push( + rust_cuda::rustacuda::context::ContextFlags::MAP_HOST + | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, + device, + )?, + ); + + rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( + rust_cuda::rustacuda::context::ResourceLimit::StackSize, + 4096, + )?; + rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( + rust_cuda::rustacuda::context::ResourceLimit::PrintfFifoSize, + 4096, + )?; + + let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( + rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, + None, + )?); + + let mut kernel = host::Launcher::try_new( + rust_cuda::rustacuda::function::GridSize::x(1), + rust_cuda::rustacuda::function::BlockSize::x(4), + )?; + + kernel.kernel(&stream)?; + + std::mem::drop(context); + + Ok(()) +} -#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel() { - rust_cuda::device::utils::print(format_args!("println! 
from CUDA kernel")); + rust_cuda::device::utils::print(format_args!("print from CUDA kernel\n")); ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); } @@ -27,8 +67,28 @@ mod host { use super::KernelArgs; use super::{Kernel, KernelPtx}; - #[allow(dead_code)] - struct Launcher; + pub struct Launcher { + kernel: rust_cuda::host::TypedKernel, + grid: rust_cuda::rustacuda::function::GridSize, + block: rust_cuda::rustacuda::function::BlockSize, + watcher: (), + } + + impl Launcher { + pub fn try_new( + grid: rust_cuda::rustacuda::function::GridSize, + block: rust_cuda::rustacuda::function::BlockSize, + ) -> rust_cuda::rustacuda::error::CudaResult { + let kernel = Self::new_kernel()?; + + Ok(Self { + kernel, + grid, + block, + watcher: (), + }) + } + } link_kernel!(); @@ -37,7 +97,18 @@ mod host { type KernelTraitObject = dyn Kernel; fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { - unimplemented!() + rust_cuda::host::LaunchPackage { + config: rust_cuda::host::LaunchConfig { + grid: self.grid.clone(), + block: self.block.clone(), + shared_memory_size: 0_u32, + ptx_jit: false, + }, + + kernel: &mut self.kernel, + + watcher: &mut self.watcher, + } } } } @@ -63,23 +134,30 @@ mod cuda_prelude { struct FormatArgs { size: usize, align: usize, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, file_len: u32, file_ptr: *const u8, line: u32, column: u32, } + let thread_idx = rust_cuda::device::thread::Thread::this().idx(); let location = ::core::panic::Location::caller(); unsafe { ::core::arch::nvptx::vprintf( - c"memory allocation of %llu bytes with alignment %llu failed at %.*s:%lu:%lu\n" + c"memory allocation of %llu bytes with alignment %llu failed on thread (x=%u, y=%u, z=%u) at %*s:%u:%u\n" .as_ptr() .cast(), #[allow(clippy::cast_possible_truncation)] ::core::ptr::from_ref(&FormatArgs { size: layout.size(), align: layout.align(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, 
file_len: location.file().len() as u32, file_ptr: location.file().as_ptr(), line: location.line(), diff --git a/src/device/utils.rs b/src/device/utils.rs index bac1c6d3b..8e644be48 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -30,7 +30,7 @@ pub fn print(args: ::core::fmt::Arguments) { unsafe { ::core::arch::nvptx::vprintf( - c"%.*s".as_ptr().cast(), + c"%*s".as_ptr().cast(), #[allow(clippy::cast_possible_truncation)] ::core::ptr::from_ref(&FormatArgs { msg_len: msg.len() as u32, From b0303d6bbbee5a5845f45bb65525dbc4cad00535 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 13:31:49 +0000 Subject: [PATCH 046/120] Clean up the print example --- examples/print/.cargo/config.toml | 3 - examples/print/src/main.rs | 167 +++++++--------------- examples/single-source/.cargo/config.toml | 3 - examples/single-source/src/main.rs | 2 +- src/device/thread.rs | 22 +++ src/device/utils.rs | 159 ++++++++++++++++++-- src/host.rs | 24 +++- src/lib.rs | 3 + 8 files changed, 248 insertions(+), 135 deletions(-) diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml index f7029e166..4a98afe58 100644 --- a/examples/print/.cargo/config.toml +++ b/examples/print/.cargo/config.toml @@ -1,5 +1,2 @@ [target.nvptx64-nvidia-cuda] rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] - -[unstable] -features = ["all"] diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 94302a7a9..c74380bf2 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -4,22 +4,45 @@ #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] -#![feature(ptr_from_ref)] -#![feature(stdsimd)] -#![feature(c_str_literals)] +#![feature(const_type_name)] extern crate alloc; +#[derive(rust_cuda::const_type_layout::TypeLayout)] 
+#[layout(crate = "rust_cuda::const_type_layout")] +#[repr(C)] +pub enum Action { + Print, + Panic, + AllocError, +} + +#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] +#[kernel(allow(ptx::local_memory_usage))] +pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { + match action { + Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), + Action::Panic => panic!("panic! from CUDA kernel"), + Action::AllocError => { + ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()) + }, + } +} + #[cfg(not(target_os = "cuda"))] fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { + // Link the non-generic CUDA kernel + type Launcher = rust_cuda::host::SimpleKernelLauncher; + link_kernel!(); + // Initialize the CUDA API rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; - // Get the first device + // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; - // Create a context associated to this device - let context = rust_cuda::host::CudaDropWrapper::from( + // Create a CUDA context associated to this device + let _context = rust_cuda::host::CudaDropWrapper::from( rust_cuda::rustacuda::context::Context::create_and_push( rust_cuda::rustacuda::context::ContextFlags::MAP_HOST | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, @@ -36,83 +59,34 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { 4096, )?; + // Create a new CUDA stream to submit kernels to let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, )?); - let mut kernel = host::Launcher::try_new( - rust_cuda::rustacuda::function::GridSize::x(1), - rust_cuda::rustacuda::function::BlockSize::x(4), - )?; - - kernel.kernel(&stream)?; - - std::mem::drop(context); + // Create a new launcher for the CUDA kernel + let mut launcher = Launcher { + kernel: 
::new_kernel()?, + config: rust_cuda::host::LaunchConfig { + grid: rust_cuda::rustacuda::function::GridSize::x(1), + block: rust_cuda::rustacuda::function::BlockSize::x(4), + shared_memory_size: 0, + ptx_jit: false, + }, + }; + + // Launch the CUDA kernel on the stream and synchronise to its completion + println!("Launching print kernel ..."); + launcher.kernel(&stream, Action::Print).unwrap(); + println!("Launching panic kernel ..."); + launcher.kernel(&stream, Action::Panic).unwrap_err(); + println!("Launching alloc error kernel ..."); + launcher.kernel(&stream, Action::AllocError).unwrap_err(); Ok(()) } -#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] -#[kernel(allow(ptx::local_memory_usage))] -pub fn kernel() { - rust_cuda::device::utils::print(format_args!("print from CUDA kernel\n")); - - ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); -} - -#[cfg(not(target_os = "cuda"))] -mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; - - pub struct Launcher { - kernel: rust_cuda::host::TypedKernel, - grid: rust_cuda::rustacuda::function::GridSize, - block: rust_cuda::rustacuda::function::BlockSize, - watcher: (), - } - - impl Launcher { - pub fn try_new( - grid: rust_cuda::rustacuda::function::GridSize, - block: rust_cuda::rustacuda::function::BlockSize, - ) -> rust_cuda::rustacuda::error::CudaResult { - let kernel = Self::new_kernel()?; - - Ok(Self { - kernel, - grid, - block, - watcher: (), - }) - } - } - - link_kernel!(); - - impl rust_cuda::host::Launcher for Launcher { - type CompilationWatcher = (); - type KernelTraitObject = dyn Kernel; - - fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { - rust_cuda::host::LaunchPackage { - config: rust_cuda::host::LaunchConfig { - grid: self.grid.clone(), - block: self.block.clone(), - shared_memory_size: 0_u32, - ptx_jit: false, - }, - - kernel: &mut self.kernel, - - watcher: &mut self.watcher, - } - } - } -} - 
#[cfg(target_os = "cuda")] mod cuda_prelude { use rust_cuda::device::alloc::PTXAllocator; @@ -122,51 +96,14 @@ mod cuda_prelude { #[panic_handler] fn panic(info: &::core::panic::PanicInfo) -> ! { - rust_cuda::device::utils::print(format_args!("{info}\n")); - - rust_cuda::device::utils::abort() + // pretty format and print the panic message + // but don't allow dynamic formatting or panic payload downcasting + rust_cuda::device::utils::pretty_panic_handler(info, false, false) } #[alloc_error_handler] #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! { - #[repr(C)] - struct FormatArgs { - size: usize, - align: usize, - thread_idx_x: u32, - thread_idx_y: u32, - thread_idx_z: u32, - file_len: u32, - file_ptr: *const u8, - line: u32, - column: u32, - } - - let thread_idx = rust_cuda::device::thread::Thread::this().idx(); - let location = ::core::panic::Location::caller(); - - unsafe { - ::core::arch::nvptx::vprintf( - c"memory allocation of %llu bytes with alignment %llu failed on thread (x=%u, y=%u, z=%u) at %*s:%u:%u\n" - .as_ptr() - .cast(), - #[allow(clippy::cast_possible_truncation)] - ::core::ptr::from_ref(&FormatArgs { - size: layout.size(), - align: layout.align(), - thread_idx_x: thread_idx.x, - thread_idx_y: thread_idx.y, - thread_idx_z: thread_idx.z, - file_len: location.file().len() as u32, - file_ptr: location.file().as_ptr(), - line: location.line(), - column: location.column(), - }) - .cast(), - ); - } - - rust_cuda::device::utils::abort() + rust_cuda::device::utils::pretty_alloc_error_handler(layout) } } diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml index f7029e166..4a98afe58 100644 --- a/examples/single-source/.cargo/config.toml +++ b/examples/single-source/.cargo/config.toml @@ -1,5 +1,2 @@ [target.nvptx64-nvidia-cuda] rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] - -[unstable] -features = ["all"] 
diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 085bd3b8d..97e0be020 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -91,7 +91,7 @@ mod host { link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); impl rc::host::Launcher for Launcher { - type CompilationWatcher = (); + type CompilationWatcher<'a> = (); type KernelTraitObject = dyn Kernel; fn get_launch_package(&mut self) -> rc::host::LaunchPackage { diff --git a/src/device/thread.rs b/src/device/thread.rs index 8f3bc5719..26ee357d2 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -18,11 +18,15 @@ pub struct ThreadBlockGrid { impl Thread { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn this() -> Self { Self { _private: () } } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn index(&self) -> usize { let block = self.block(); let grid = block.grid(); @@ -34,6 +38,8 @@ impl Thread { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn idx(&self) -> Idx3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -46,6 +52,8 @@ impl Thread { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn block(&self) -> ThreadBlock { ThreadBlock { _private: () } } @@ -53,6 +61,8 @@ impl Thread { impl ThreadBlock { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn dim(&self) -> Dim3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -65,6 +75,8 @@ impl ThreadBlock { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn idx(&self) -> Idx3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -77,10 +89,14 @@ impl ThreadBlock { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn grid(&self) -> ThreadBlockGrid { ThreadBlockGrid { _private: () } } + #[allow(clippy::inline_always)] + #[inline(always)] pub fn synchronize(&self) { unsafe { nvptx::_syncthreads() } } @@ -88,6 +104,8 @@ impl 
ThreadBlock { impl ThreadBlockGrid { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn dim(&self) -> Dim3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -118,6 +136,8 @@ pub struct Idx3 { impl Dim3 { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn size(&self) -> usize { (self.x as usize) * (self.y as usize) * (self.z as usize) } @@ -125,6 +145,8 @@ impl Dim3 { impl Idx3 { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn as_id(&self, dim: &Dim3) -> usize { (self.x as usize) + (self.y as usize) * (dim.x as usize) diff --git a/src/device/utils.rs b/src/device/utils.rs index 8e644be48..e7206e118 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -5,9 +5,30 @@ pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } -/// The [`print`](print()) function takes an [`Arguments`](core::fmt::Arguments) -/// struct and formats and prints it to the CUDA kernel's standard output using -/// the `vprintf` system call. +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::print!`] macro, which now forwards to the +/// [`print()`] function. +pub macro print($($arg:tt)*) { + self::print(::core::format_args!($($arg)*)) +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::println!`] macro, which now forwards to the +/// [`print()`] function. +pub macro println { + () => { + self::print(::core::format_args!("\n")) + }, + ($($arg:tt)*) => { + self::print(::core::format_args!("{}\n", ::core::format_args!($($arg)*))) + }, +} + +/// The [`print()`] function takes an [`Arguments`](core::fmt::Arguments) struct +/// and formats and prints it to the CUDA kernel's standard output using the +/// `vprintf` system call. /// /// The [`Arguments`](core::fmt::Arguments) instance can be created with the /// [`format_args!`](core::format_args) macro. 
@@ -28,15 +49,133 @@ pub fn print(args: ::core::fmt::Arguments) { msg.as_str() }; + let args = FormatArgs { + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), + }; + + unsafe { + ::core::arch::nvptx::vprintf(c"%*s".as_ptr().cast(), ::core::ptr::from_ref(&args).cast()); + } +} + +// TODO: docs +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_panic_handler( + info: &::core::panic::PanicInfo, + allow_dynamic_message: bool, + allow_dynamic_payload: bool, +) -> ! { + #[repr(C)] + struct FormatArgs { + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, + msg_len: u32, + msg_ptr: *const u8, + } + + let msg; // place to store the dynamically expanded format string + let msg = if let Some(message) = info.message() { + if let Some(msg) = message.as_str() { + msg + } else if allow_dynamic_message { + msg = ::alloc::fmt::format(*message); + msg.as_str() + } else { + "" + } + } else if let Some(msg) = info.payload().downcast_ref::<&'static str>() + && allow_dynamic_payload + { + msg + } else if let Some(msg) = info.payload().downcast_ref::<::alloc::string::String>() + && allow_dynamic_payload + { + msg.as_str() + } else { + "" + }; + + let location_line = info.location().map_or(0, ::core::panic::Location::line); + let location_column = info.location().map_or(0, ::core::panic::Location::column); + let location_file = info + .location() + .map_or("", ::core::panic::Location::file); + + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + file_len: u32::try_from(location_file.len()).unwrap_or(u32::MAX), + file_ptr: location_file.as_ptr(), + line: location_line, + column: location_column, + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), + }; + unsafe { ::core::arch::nvptx::vprintf( - 
c"%*s".as_ptr().cast(), - #[allow(clippy::cast_possible_truncation)] - ::core::ptr::from_ref(&FormatArgs { - msg_len: msg.len() as u32, - msg_ptr: msg.as_ptr(), - }) - .cast(), + c"panicked at %*s:%u:%u on thread (x=%u, y=%u, z=%u):\n%*s\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), ); } + + abort() +} + +// TODO: docs +#[track_caller] +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, + } + + let location = ::core::panic::Location::caller(); + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: u32::try_from(location.file().len()).unwrap_or(u32::MAX), + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + }; + + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at \ + %*s:%u:%u on thread (x=%u, y=%u, z=%u)\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), + ); + } + + abort() } diff --git a/src/host.rs b/src/host.rs index 9709798d3..1bdce5ee6 100644 --- a/src/host.rs +++ b/src/host.rs @@ -29,7 +29,7 @@ use crate::{ pub trait Launcher { type KernelTraitObject: ?Sized; - type CompilationWatcher; + type CompilationWatcher<'a>; fn get_launch_package(&mut self) -> LaunchPackage; @@ -38,7 +38,7 @@ pub trait Launcher { /// Should only return a [`CudaError`] if some implementation-defined /// critical kernel function configuration failed. 
#[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { + fn on_compile(kernel: &Function, watcher: Self::CompilationWatcher<'_>) -> CudaResult<()> { Ok(()) } } @@ -54,7 +54,25 @@ pub struct LaunchConfig { pub struct LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, pub kernel: &'l mut TypedKernel, - pub watcher: &'l mut L::CompilationWatcher, + pub watcher: L::CompilationWatcher<'l>, +} + +pub struct SimpleKernelLauncher { + pub kernel: TypedKernel, + pub config: LaunchConfig, +} + +impl Launcher for SimpleKernelLauncher { + type CompilationWatcher<'a> = (); + type KernelTraitObject = KernelTraitObject; + + fn get_launch_package(&mut self) -> LaunchPackage { + LaunchPackage { + config: self.config.clone(), + kernel: &mut self.kernel, + watcher: (), + } + } } pub enum KernelJITResult<'k> { diff --git a/src/lib.rs b/src/lib.rs index 0316613c9..5ac5f8218 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,9 @@ #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] #![feature(ptr_from_ref)] +#![feature(decl_macro)] +#![feature(panic_info_message)] +#![feature(let_chains)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![allow(incomplete_features)] From df09a966393be7720853d1ab0b00567702143b59 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 14:20:54 +0000 Subject: [PATCH 047/120] ptr_from_ref is stable from 1.76 --- rust-cuda-ptx-jit/src/lib.rs | 3 ++- src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index 1f22b2830..802e26cdd 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -1,6 +1,7 @@ #![deny(clippy::pedantic)] #![cfg_attr(not(feature = "host"), no_std)] -#![feature(ptr_from_ref)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![feature(doc_cfg)] 
#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/lib.rs b/src/lib.rs index 5ac5f8218..100e95325 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,12 +23,12 @@ #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] -#![feature(ptr_from_ref)] #![feature(decl_macro)] #![feature(panic_info_message)] #![feature(let_chains)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] From a49dd175698010d3ec985526c0b28c67deed50db Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 15:34:15 +0000 Subject: [PATCH 048/120] Exit on CUDA panic instead of abort to allow the host to handle the error --- examples/print/src/main.rs | 15 +++------------ src/device/utils.rs | 10 ++++++++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index c74380bf2..68c16ab03 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -50,15 +50,6 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { )?, ); - rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( - rust_cuda::rustacuda::context::ResourceLimit::StackSize, - 4096, - )?; - rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( - rust_cuda::rustacuda::context::ResourceLimit::PrintfFifoSize, - 4096, - )?; - // Create a new CUDA stream to submit kernels to let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, @@ -78,11 +69,11 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - launcher.kernel(&stream, 
Action::Print).unwrap(); + launcher.kernel(&stream, Action::Print)?; println!("Launching panic kernel ..."); - launcher.kernel(&stream, Action::Panic).unwrap_err(); + launcher.kernel(&stream, Action::Panic)?; println!("Launching alloc error kernel ..."); - launcher.kernel(&stream, Action::AllocError).unwrap_err(); + launcher.kernel(&stream, Action::AllocError)?; Ok(()) } diff --git a/src/device/utils.rs b/src/device/utils.rs index e7206e118..073e7bd54 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -5,6 +5,12 @@ pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn exit() -> ! { + unsafe { ::core::arch::asm!("exit;", options(noreturn)) } +} + /// Prints to the CUDA kernel's standard output using the `vprintf` system call. /// /// Replacement for the [`std::print!`] macro, which now forwards to the @@ -131,7 +137,7 @@ pub fn pretty_panic_handler( ); } - abort() + exit() } // TODO: docs @@ -177,5 +183,5 @@ pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ ); } - abort() + exit() } From 1e4de0cb478fd5afd01ab59e301b68bdddc601e4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 19 Dec 2023 06:55:49 +0000 Subject: [PATCH 049/120] Backup of early progress for switching from kernel traits to functions --- examples/print/src/main.rs | 39 ++-- examples/single-source/src/main.rs | 36 ++-- rust-cuda-derive/src/kernel/link/mod.rs | 34 ++- .../src/kernel/specialise/call.rs | 18 ++ rust-cuda-derive/src/kernel/wrapper/config.rs | 4 - .../{get_ptx_str.rs => get_ptx.rs} | 12 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 75 ++++--- .../generate/cpu_linker_macro/new_kernel.rs | 34 --- .../generate/cpu_wrapper/kernel_func.rs | 65 ++++-- .../cpu_wrapper/kernel_func_async/mod.rs | 1 + .../wrapper/generate/cpu_wrapper/mod.rs | 52 ++--- .../src/kernel/wrapper/inputs/mod.rs | 68 +++--- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 +- rust-cuda-ptx-jit/src/host/compiler/mod.rs | 41 ---- rust-cuda-ptx-jit/src/host/kernel.rs | 58 ------ rust-cuda-ptx-jit/src/host/mod.rs | 43 +++- .../src/host/{compiler => }/preprocess.rs | 0 .../src/host/{compiler => }/regex.rs | 0 .../src/host/{compiler => }/replace.rs | 0 rust-cuda-ptx-jit/src/lib.rs | 2 +- src/host.rs | 194 ++++++++++++------ src/lib.rs | 1 + src/safety/kernel_signature.rs | 46 ++++- 23 files changed, 469 insertions(+), 368 deletions(-) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{get_ptx_str.rs => get_ptx.rs} (95%) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs delete mode 100644 rust-cuda-ptx-jit/src/host/compiler/mod.rs delete mode 100644 rust-cuda-ptx-jit/src/host/kernel.rs rename rust-cuda-ptx-jit/src/host/{compiler => }/preprocess.rs (100%) rename rust-cuda-ptx-jit/src/host/{compiler => }/regex.rs (100%) rename rust-cuda-ptx-jit/src/host/{compiler => }/replace.rs (100%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 68c16ab03..cedaa6bae 100644 --- 
a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -5,6 +5,9 @@ #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![feature(type_alias_impl_trait)] extern crate alloc; @@ -17,7 +20,7 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(pub use link! as impl Kernel for Launcher)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { match action { @@ -32,11 +35,11 @@ pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { #[cfg(not(target_os = "cuda"))] fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Link the non-generic CUDA kernel - type Launcher = rust_cuda::host::SimpleKernelLauncher; - link_kernel!(); + struct KernelPtx; + link! 
{ impl kernel for KernelPtx } // Initialize the CUDA API - rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + /*rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; @@ -54,26 +57,28 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, - )?); + )?);*/ - // Create a new launcher for the CUDA kernel - let mut launcher = Launcher { - kernel: ::new_kernel()?, - config: rust_cuda::host::LaunchConfig { - grid: rust_cuda::rustacuda::function::GridSize::x(1), - block: rust_cuda::rustacuda::function::BlockSize::x(4), - shared_memory_size: 0, - ptx_jit: false, - }, + // Create a new instance of the CUDA kernel and prepare the launch config + let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); + let config = rust_cuda::host::LaunchConfig { + grid: rust_cuda::rustacuda::function::GridSize::x(1), + block: rust_cuda::rustacuda::function::BlockSize::x(4), + shared_memory_size: 0, + ptx_jit: false, }; + // let mut launcher = rust_cuda::host::Launcher { kernel: &mut typed_kernel, config }; // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - launcher.kernel(&stream, Action::Print)?; + kernel.launch1(&config, Action::Print)?; + // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - launcher.kernel(&stream, Action::Panic)?; + kernel.launch1(&config, Action::Panic)?; + // kernel(&mut launcher, Action::Panic)?; println!("Launching alloc error kernel ..."); - launcher.kernel(&stream, Action::AllocError)?; + kernel.launch1(&config, Action::AllocError)?; + // kernel(&mut launcher, Action::AllocError)?; Ok(()) } diff --git a/examples/single-source/src/main.rs 
b/examples/single-source/src/main.rs index 97e0be020..b06f7031c 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -6,6 +6,10 @@ #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(offset_of)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![feature(type_alias_impl_trait)] +#![feature(associated_type_bounds)] extern crate alloc; @@ -42,13 +46,13 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rc::common::kernel(use link! as impl Kernel for Launcher)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), forbid(ptx::local_memory_usage, ptx::register_spills) )] -pub fn kernel<'a, T: rc::common::RustToCuda>( +pub fn kernel<'a, T: rc::common::RustToCuda + rc::safety::StackOnly + rc::safety::NoSafeAliasing>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -57,11 +61,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, -) where - T: rc::safety::StackOnly + rc::safety::NoSafeAliasing, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ +) { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -80,24 +80,12 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[cfg(not(target_os = "cuda"))] mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; + use super::{kernel, KernelArgs}; - #[allow(dead_code)] - struct 
Launcher(core::marker::PhantomData); - - link_kernel!(crate::Empty); - link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); - - impl rc::host::Launcher for Launcher { - type CompilationWatcher<'a> = (); - type KernelTraitObject = dyn Kernel; - - fn get_launch_package(&mut self) -> rc::host::LaunchPackage { - unimplemented!() - } - } + // Link several instances of the generic CUDA kernel + struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); + link! { impl kernel<'a, crate::Empty> for KernelPtx } + link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 78a352780..bcbe297cf 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, env, - ffi::CString, + ffi::{CStr, CString}, fmt::Write as FmtWrite, fs, io::{Read, Write}, @@ -70,7 +70,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { #[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { - const PTX_STR: &'static str = "ERROR in this PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; }); let LinkKernelConfig { @@ -95,7 +95,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { if skip_kernel_compilation() { return quote! { - const PTX_STR: &'static str = "CLIPPY skips specialised PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; } .into(); } @@ -107,7 +107,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Specialisation::Link(&specialisation), ) else { return (quote! 
{ - const PTX_STR: &'static str = "ERROR in this PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; }) .into(); }; @@ -122,7 +122,23 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { &ptx_lint_levels, ); - (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() + let mut kernel_ptx = kernel_ptx.into_bytes(); + kernel_ptx.push(b'\0'); + + if let Err(err) = CStr::from_bytes_with_nul(&kernel_ptx) { + abort_call_site!( + "Kernel compilation generated invalid PTX: internal nul byte: {:?}", + err + ); + } + + // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 + let kernel_ptx = syn::LitByteStr::new(&kernel_ptx, proc_macro2::Span::call_site()); + // Safety: the validity of kernel_ptx as a CStr was just checked above + let kernel_ptx = + quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; + + (quote! { const PTX_CSTR: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }).into() } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { @@ -626,7 +642,7 @@ fn compile_kernel( Some(kernel_ptx) }, Err(err) => { - eprintln!("{err:?}"); + eprintln!("{err}"); emit_ptx_build_error(); None }, @@ -669,7 +685,7 @@ fn build_kernel_with_specialisation( let any_output = AtomicBool::new(false); let crate_name = String::from(builder.get_crate_name()); - match builder.build_live( + let build = builder.build_live( |stdout_line| { if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = serde_json::from_str(stdout_line) @@ -737,7 +753,9 @@ fn build_kernel_with_specialisation( ); colored::control::unset_override(); }, - )? 
{ + )?; + + match build { BuildStatus::Success(output) => { let ptx_path = output.get_assembly_path(); diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-derive/src/kernel/specialise/call.rs index 34eb0dc35..10e43d26a 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-derive/src/kernel/specialise/call.rs @@ -1,3 +1,5 @@ +use std::ffi::CStr; + use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] @@ -25,6 +27,22 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; + let mut mangled_kernel_ident = mangled_kernel_ident.into_bytes(); + mangled_kernel_ident.push(b'\0'); + + if let Err(err) = CStr::from_bytes_with_nul(&mangled_kernel_ident) { + abort_call_site!( + "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", + err + ); + } + + // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 + let mangled_kernel_ident = + syn::LitByteStr::new(&mangled_kernel_ident, proc_macro2::Span::call_site()); + // Safety: the validity of mangled_kernel_ident as a CStr was just checked above + let mangled_kernel_ident = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#mangled_kernel_ident) } }; + (quote! 
{ #mangled_kernel_ident }).into() } diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index d8951230d..6ba9ebedc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -3,7 +3,6 @@ pub(super) struct KernelConfig { pub(super) linker: syn::Ident, pub(super) kernel: syn::Ident, pub(super) args: syn::Ident, - pub(super) ptx: syn::Ident, pub(super) launcher: syn::Ident, } @@ -18,8 +17,6 @@ impl syn::parse::Parse for KernelConfig { let kernel: syn::Ident = input.parse()?; let _lt_token: syn::token::Lt = input.parse()?; let args: syn::Ident = input.parse()?; - let _comma: syn::token::Comma = input.parse()?; - let ptx: syn::Ident = input.parse()?; let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; @@ -30,7 +27,6 @@ impl syn::parse::Parse for KernelConfig { linker, kernel, args, - ptx, launcher, }) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs similarity index 95% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 10732a133..790b3b8df 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -6,7 +6,7 @@ use crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_get_ptx_str( +pub(super) fn quote_get_ptx( crate_path: &syn::Path, FuncIdent { func_ident, @@ -43,12 +43,12 @@ pub(super) fn quote_get_ptx_str( #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }> = 
#crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( - PTX_STR.as_bytes(), - concat!(".visible .entry ", #crate_path::host::specialise_kernel_call!( + PTX_CSTR.to_bytes(), + #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token - )).as_bytes() + ).to_bytes(), ) }>; } @@ -78,7 +78,7 @@ pub(super) fn quote_get_ptx_str( }; quote! { - fn get_ptx_str() -> &'static str { + fn get_ptx() -> &'static ::core::ffi::CStr { #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* @@ -100,7 +100,7 @@ pub(super) fn quote_get_ptx_str( )* } } - PTX_STR + PTX_CSTR } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 0ca963bb2..495b61870 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -2,33 +2,35 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; -mod get_ptx_str; -mod new_kernel; +mod get_ptx; -use get_ptx_str::quote_get_ptx_str; -use new_kernel::quote_new_kernel; +use get_ptx::quote_get_ptx; pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { visibility, linker, + kernel, launcher, - ptx, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, generic_trait_params: generic_params, generic_close_token, + generic_kernel_params, .. }: &DeclGenerics, func_inputs: &FunctionInputs, - func_ident: &FuncIdent, + func_ident @ FuncIdent { + func_ident: func_ident_name, + func_ident_hash, .. 
+ }: &FuncIdent, func_params: &[syn::Ident], ptx_lint_levels: &TokenStream, ) -> TokenStream { - let macro_types = generic_params + let macro_generics = generic_kernel_params//generic_params .iter() .enumerate() .map(|(i, generic)| { @@ -37,50 +39,77 @@ pub(in super::super) fn quote_cpu_linker_macro( match generic { syn::GenericParam::Type(_) => quote!($#generic_ident:ty), syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => unreachable!(), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime),//unreachable!(), } }) .collect::>(); - let macro_type_ids = (0..generic_params.len()) + let macro_generic_ids = (0..generic_kernel_params.len()) .map(|i| quote::format_ident!("__g_{}", i)) .collect::>(); + let macro_only_lt_generic_ids = generic_kernel_params//generic_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => None, + syn::GenericParam::Lifetime(_) => Some(generic_ident), + } + }) + .collect::>(); + + let macro_non_lt_generic_ids = generic_kernel_params//generic_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => Some(generic_ident), + syn::GenericParam::Lifetime(_) => None, + } + }) + .collect::>(); + let cpu_linker_macro_visibility = if visibility.is_some() { quote! { #[macro_export] } } else { quote! {} }; - let get_ptx_str = quote_get_ptx_str( + let get_ptx = quote_get_ptx( crate_path, func_ident, config, decl_generics, func_inputs, func_params, - ¯o_type_ids, + ¯o_non_lt_generic_ids, ptx_lint_levels, ); - let new_kernel = quote_new_kernel( - crate_path, - config, - decl_generics, - func_ident, - ¯o_type_ids, - ); quote! { #[cfg(not(target_os = "cuda"))] #cpu_linker_macro_visibility macro_rules! 
#linker { - (#(#macro_types),* $(,)?) => { - unsafe impl #ptx #generic_start_token #($#macro_type_ids),* #generic_close_token - for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token + (impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident) => { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + //dyn #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token // #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token { - #get_ptx_str + #get_ptx - #new_kernel + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::host::specialise_kernel_call!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) + } } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs deleted file mode 100644 index 6b53954e4..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ /dev/null @@ -1,34 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; - -pub(super) fn quote_new_kernel( - crate_path: &syn::Path, - KernelConfig { kernel, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - macro_type_ids: &[syn::Ident], -) -> TokenStream { - quote! 
{ - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > { - let ptx = Self::get_ptx_str(); - let entry_point = #crate_path::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - ); - - #crate_path::host::TypedKernel::new(ptx, entry_point) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 94b4b9598..6fa778eb3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -5,11 +5,13 @@ use super::super::super::{ }; #[allow(clippy::too_many_arguments)] +#[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { kernel, args, .. }: &KernelConfig, + KernelConfig { kernel, args, visibility, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { + generic_kernel_params, generic_wrapper_params, generic_wrapper_where_clause, .. @@ -45,7 +47,7 @@ pub(super) fn quote_kernel_func_inputs( }, }; - let kernel_func_inputs = func_inputs + let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .enumerate() .map(|(i, arg)| match arg { @@ -56,44 +58,73 @@ pub(super) fn quote_kernel_func_inputs( ty, }) => { let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { + let syn_type: syn::Type = syn::parse_quote! { <() as #args #ty_generics>::#type_ident }; - - if let syn::Type::Reference(syn::TypeReference { + let syn_type = if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { - quote! 
{ - #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type - } + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: lifetime.clone(), + mutability: *mutability, + elem: Box::new(syn_type), + }) } else { - quote! { #(#attrs)* #pat #colon_token #syn_type } - } + syn_type + }; + + let param = quote! { + #(#attrs)* #pat #colon_token #syn_type + }; + + (param, syn_type) }, syn::FnArg::Receiver(_) => unreachable!(), }) - .collect::>(); + .unzip(); let raw_func_input_wrap = generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); + let full_generics = generic_kernel_params.iter().map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }).collect::>(); + + let ty_turbofish = ty_generics.as_turbofish(); + quote! { + #[cfg(not(target_os = "cuda"))] + #[allow(non_camel_case_types)] + #visibility type #func_ident <#generic_kernel_params> = impl Copy + Fn( + &mut #crate_path::host::Launcher<#func_ident <#(#full_generics),*>>, + #(#kernel_func_input_tys),* + ) -> #crate_path::rustacuda::error::CudaResult<()>; + + #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::needless_lifetimes)] #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - fn #func_ident <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, + #visibility fn #func_ident ( + // &mut self, + // TODO: move the stream + // stream: &'stream #crate_path::rustacuda::stream::Stream, + // kernel: &mut #crate_path::host::TypedKernel<#func_ident #ty_generics>, + launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause + // TODO: don't allow 
where clause + //#generic_wrapper_where_clause { + let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; + // impls check adapted from Nikolai Vazquez's `impls` crate: // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 const fn __check_is_sync(_x: &T) -> bool { @@ -110,7 +141,9 @@ pub(super) fn quote_kernel_func_inputs( >::SYNC } - #raw_func_input_wrap + todo!() + + // #raw_func_input_wrap } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 747f4a278..63d0d472f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -61,6 +61,7 @@ pub(super) fn quote_kernel_func_async( generate_launch_types(crate_path, config, impl_generics, func_inputs); quote! { + #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] #[allow(clippy::too_many_arguments)] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index 1b984f920..7007abe87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -14,10 +14,7 @@ use kernel_func_async::quote_kernel_func_async; pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, config @ KernelConfig { - visibility, - kernel, - ptx, - .. + visibility, kernel, .. }: &KernelConfig, decl @ DeclGenerics { generic_start_token, @@ -37,12 +34,6 @@ pub(in super::super) fn quote_cpu_wrapper( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! 
{ - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - let kernel_func = quote_kernel_func_inputs( crate_path, config, @@ -65,32 +56,27 @@ pub(in super::super) fn quote_cpu_wrapper( ); quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #ptx #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; + // #[cfg(not(target_os = "cuda"))] + // #[allow(clippy::missing_safety_doc)] + // #visibility unsafe trait #kernel #generic_start_token + // #generic_trait_params + // #generic_close_token: #crate_path::host::CompiledKernelPtx< + // dyn #kernel #ty_generics + // > #generic_trait_where_clause + // { + // #kernel_func - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > where #launcher_predicate; - } + // #kernel_func_async + // } - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token: #ptx #ty_generics - #generic_trait_where_clause - { - #kernel_func + // #[cfg(not(target_os = "cuda"))] + // #[allow(clippy::missing_safety_doc)] + // unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty + // #blanket_where_clause + // {} - #kernel_func_async - } + #kernel_func - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty - #blanket_where_clause - {} + // #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index f3cc1a4d8..4a25bf958 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -136,26 +136,26 @@ fn ensure_reference_type_lifetime( mutability, 
elem, }) => { - let lifetime = lifetime.clone().unwrap_or_else(|| { - let lifetime = syn::Lifetime::new( - &format!("'__r2c_lt_{implicit_lifetime_id}"), - lifetime.span(), - ); - - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: lifetime.clone(), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - - lifetime - }); + // let lifetime = lifetime.clone().unwrap_or_else(|| { + // let lifetime = syn::Lifetime::new( + // &format!("'__r2c_lt_{implicit_lifetime_id}"), + // lifetime.span(), + // ); + + // generic_params.insert( + // *implicit_lifetime_id, + // syn::GenericParam::Lifetime(syn::LifetimeDef { + // attrs: Vec::new(), + // lifetime: lifetime.clone(), + // colon_token: None, + // bounds: syn::punctuated::Punctuated::new(), + // }), + // ); + + // *implicit_lifetime_id += 1; + + // lifetime + // }); let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { (|| { @@ -203,25 +203,25 @@ fn ensure_reference_type_lifetime( Box::new(syn::Type::Reference(syn::TypeReference { and_token: *and_token, - lifetime: Some(lifetime), + lifetime: lifetime.clone(),//Some(lifetime), mutability: *mutability, elem, })) }, ty => { - if matches!(cuda_type, InputCudaType::LendRustToCuda) { - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: r2c_move_lifetime(i, ty), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - } + // if matches!(cuda_type, InputCudaType::LendRustToCuda) { + // generic_params.insert( + // *implicit_lifetime_id, + // syn::GenericParam::Lifetime(syn::LifetimeDef { + // attrs: Vec::new(), + // lifetime: r2c_move_lifetime(i, ty), + // colon_token: None, + // bounds: syn::punctuated::Punctuated::new(), + // }), + // ); + + // *implicit_lifetime_id += 1; + // } Box::new(ty.clone()) }, 
diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a812f9dd4..a4db5f7f3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -36,8 +36,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects \ - LINKER, KERNEL, ARGS, PTX, and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ + KERNEL, ARGS, and LAUNCHER identifiers: {:?}", err ) }, @@ -211,12 +211,18 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); let blanket_ty = syn::Ident::new("K", Span::mixed_site()); let mut blanket_params = generic_trait_params.clone(); - let ptx = &config.ptx; blanket_params.push(syn::GenericParam::Type(syn::TypeParam { attrs: Vec::new(), ident: blanket_ty.clone(), colon_token: syn::parse_quote!(:), - bounds: syn::parse_quote!(#ptx #ty_generics), + bounds: { + let kernel = &config.kernel; + syn::parse_quote! 
{ + #crate_path::host::CompiledKernelPtx< + dyn #kernel #ty_generics + > + } + }, eq_token: None, default: None, })); diff --git a/rust-cuda-ptx-jit/src/host/compiler/mod.rs b/rust-cuda-ptx-jit/src/host/compiler/mod.rs deleted file mode 100644 index 156e8223c..000000000 --- a/rust-cuda-ptx-jit/src/host/compiler/mod.rs +++ /dev/null @@ -1,41 +0,0 @@ -use std::ffi::{CStr, CString}; - -mod preprocess; -mod regex; -mod replace; - -type ByteSliceOptionalArguments = Option>]>>; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct PtxJITCompiler { - ptx_slices: Box<[PtxElement]>, - last_arguments: ByteSliceOptionalArguments, - last_ptx: CString, -} - -#[doc(cfg(feature = "host"))] -pub enum PtxJITResult<'s> { - Cached(&'s CStr), - Recomputed(&'s CStr), -} - -enum PtxLoadWidth { - B1, - B2, - B4, - B8, -} - -enum PtxElement { - CopiedSource { - ptx: Box<[u8]>, - }, - ConstLoad { - ptx: Box<[u8]>, - parameter_index: usize, - byte_offset: usize, - load_width: PtxLoadWidth, - registers: Box<[Box<[u8]>]>, - }, -} diff --git a/rust-cuda-ptx-jit/src/host/kernel.rs b/rust-cuda-ptx-jit/src/host/kernel.rs deleted file mode 100644 index 02baabfcf..000000000 --- a/rust-cuda-ptx-jit/src/host/kernel.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::{ffi::CStr, mem::ManuallyDrop}; - -use rustacuda::{error::CudaResult, function::Function, module::Module}; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct CudaKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl CudaKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. 
- pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for CudaKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs index 2ace3405d..156e8223c 100644 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ b/rust-cuda-ptx-jit/src/host/mod.rs @@ -1,2 +1,41 @@ -pub mod compiler; -pub mod kernel; +use std::ffi::{CStr, CString}; + +mod preprocess; +mod regex; +mod replace; + +type ByteSliceOptionalArguments = Option>]>>; + +#[doc(cfg(feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub struct PtxJITCompiler { + ptx_slices: Box<[PtxElement]>, + last_arguments: ByteSliceOptionalArguments, + last_ptx: CString, +} + +#[doc(cfg(feature = "host"))] +pub enum PtxJITResult<'s> { + Cached(&'s CStr), + Recomputed(&'s CStr), +} + +enum PtxLoadWidth { + B1, + B2, + B4, + B8, +} + +enum PtxElement { + CopiedSource { + ptx: Box<[u8]>, + }, + ConstLoad { + ptx: Box<[u8]>, + parameter_index: usize, + byte_offset: usize, + load_width: PtxLoadWidth, + registers: Box<[Box<[u8]>]>, + }, +} diff --git 
a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs b/rust-cuda-ptx-jit/src/host/preprocess.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/preprocess.rs rename to rust-cuda-ptx-jit/src/host/preprocess.rs diff --git a/rust-cuda-ptx-jit/src/host/compiler/regex.rs b/rust-cuda-ptx-jit/src/host/regex.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/regex.rs rename to rust-cuda-ptx-jit/src/host/regex.rs diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/rust-cuda-ptx-jit/src/host/replace.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/replace.rs rename to rust-cuda-ptx-jit/src/host/replace.rs diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index 802e26cdd..8b25fc9a0 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -9,7 +9,7 @@ mod host; #[cfg(feature = "host")] -pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKernel}; +pub use host::{PtxJITCompiler, PtxJITResult}; #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] diff --git a/src/host.rs b/src/host.rs index 1bdce5ee6..ea2bd11a8 100644 --- a/src/host.rs +++ b/src/host.rs @@ -1,4 +1,5 @@ -use core::{ +use std::{ + ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, @@ -23,27 +24,28 @@ use crate::{ common::{ DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, + ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; -pub trait Launcher { - type KernelTraitObject: ?Sized; - type CompilationWatcher<'a>; +pub struct Launcher<'a, Kernel> { + pub kernel: &'a mut TypedPtxKernel, + pub config: LaunchConfig, +} - fn get_launch_package(&mut self) -> LaunchPackage; +impl<'a, Kernel> Launcher<'a, Kernel> { + #[allow(clippy::missing_errors_doc)] + pub fn launch0(&mut self) -> CudaResult<()> where Kernel: 
Copy + FnOnce(&mut Launcher) -> CudaResult<()> { + self.kernel.launch0(&self.config) + } - /// # Errors - /// - /// Should only return a [`CudaError`] if some implementation-defined - /// critical kernel function configuration failed. - #[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: Self::CompilationWatcher<'_>) -> CudaResult<()> { - Ok(()) + #[allow(clippy::missing_errors_doc)] + pub fn launch1(&mut self, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { + self.kernel.launch1(&self.config, arg1) } } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct LaunchConfig { pub grid: rustacuda::function::GridSize, pub block: rustacuda::function::BlockSize, @@ -51,26 +53,57 @@ pub struct LaunchConfig { pub ptx_jit: bool, } -pub struct LaunchPackage<'l, L: ?Sized + Launcher> { - pub config: LaunchConfig, - pub kernel: &'l mut TypedKernel, - pub watcher: L::CompilationWatcher<'l>, +#[doc(cfg(feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub struct PtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, } -pub struct SimpleKernelLauncher { - pub kernel: TypedKernel, - pub config: LaunchConfig, +impl PtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. 
+ pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } } -impl Launcher for SimpleKernelLauncher { - type CompilationWatcher<'a> = (); - type KernelTraitObject = KernelTraitObject; +impl Drop for PtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } - fn get_launch_package(&mut self) -> LaunchPackage { - LaunchPackage { - config: self.config.clone(), - kernel: &mut self.kernel, - watcher: (), + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); } } } @@ -80,63 +113,92 @@ pub enum KernelJITResult<'k> { Recompiled(&'k Function<'k>), } -pub struct TypedKernel { +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +pub struct TypedPtxKernel { compiler: PtxJITCompiler, - kernel: Option, - entry_point: alloc::boxed::Box, - marker: PhantomData, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, } -impl TypedKernel { - /// # Errors - /// - /// Returns a [`CudaError`] if `ptx` or `entry_point` contain nul bytes. 
- pub fn new(ptx: &str, entry_point: &str) -> CudaResult { - let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; - - let compiler = crate::ptx_jit::PtxJITCompiler::new(&ptx_cstring); - - let entry_point_cstring = - std::ffi::CString::new(entry_point).map_err(|_| CudaError::InvalidValue)?; - let entry_point = entry_point_cstring.into_boxed_c_str(); +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = crate::ptx_jit::PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - Ok(Self { + Self { compiler, - kernel: None, + ptx_kernel: None, entry_point, - marker: PhantomData, - }) + configure, + marker: PhantomData::, + } } /// # Errors /// - /// Returns a [`CudaError`] if `ptx` (from [`Self::new`]) is not a valid - /// PTX source, or it does not contain an entry point named `entry_point` - /// (from [`Self::new`]). + /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to + /// [`Self::new`] is not a valid PTX source or does not contain the + /// entry point it declares. 
pub fn compile_with_ptx_jit_args( &mut self, arguments: Option<&[Option<*const [u8]>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); - let kernel_jit = match (&mut self.kernel, ptx_jit) { - (Some(kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(kernel.get_function()) + let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { + (Some(ptx_kernel), PtxJITResult::Cached(_)) => { + KernelJITResult::Cached(ptx_kernel.get_function()) }, - (kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_kernel = CudaKernel::new(ptx_cstr, &self.entry_point)?; + (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { + let recomputed_ptx_kernel = PtxKernel::new(ptx_cstr, &self.entry_point)?; // Replace the existing compiled kernel, drop the old one - let kernel = kernel.insert(recomputed_kernel); + let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); + + let function = ptx_kernel.get_function(); + + if let Some(configure) = self.configure.as_mut() { + configure(function)?; + } - KernelJITResult::Recompiled(kernel.get_function()) + KernelJITResult::Recompiled(function) }, }; Ok(kernel_jit) } + + #[allow(clippy::missing_errors_doc)] + pub fn launch0(&mut self, config: &LaunchConfig) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { + (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }) + } + + #[allow(clippy::missing_errors_doc)] + pub fn launch1(&mut self, config: &LaunchConfig, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { + (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }, arg1) + } +} + +const fn conjure() -> T { + union Transmute { + empty: (), + magic: T, + } + + assert!(std::mem::size_of::() == 0); + assert!(std::mem::align_of::() == 1); + + unsafe { Transmute { empty: () }.magic } } +struct Assert; +trait True {} +impl 
True for Assert {} + pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the @@ -908,3 +970,17 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea self.host_val } } + +/// # Safety +/// +/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond +/// to the compiled kernel code for the `Kernel` function and contain a kernel +/// entry point whose name is returned by +/// [`CompiledKernelPtx::get_entry_point`]. +/// +/// This trait should not be implemented manually – use the +/// [`kernel`](crate::common::kernel) macro instead. +pub unsafe trait CompiledKernelPtx { + fn get_ptx() -> &'static CStr; + fn get_entry_point() -> &'static CStr; +} diff --git a/src/lib.rs b/src/lib.rs index 100e95325..15e704e79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(decl_macro)] #![feature(panic_info_message)] #![feature(let_chains)] +#![feature(inline_const)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] diff --git a/src/safety/kernel_signature.rs b/src/safety/kernel_signature.rs index 4a82ec1d0..96bdd3f32 100644 --- a/src/safety/kernel_signature.rs +++ b/src/safety/kernel_signature.rs @@ -7,13 +7,33 @@ pub enum CpuAndGpuKernelSignatures { pub struct Assert; #[must_use] -pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures { - let mut i = 0; +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> CpuAndGpuKernelSignatures { + const KERNEL_TYPE: &[u8] = b".visible .entry "; + let mut j = 0; + while j < ptx.len() { + let Some(j2) = find(ptx, KERNEL_TYPE, j) else { + return CpuAndGpuKernelSignatures::Mismatch; + }; + + if starts_with(ptx, entry_point, j2) { + return CpuAndGpuKernelSignatures::Match; + } + + j += 1; + } + + CpuAndGpuKernelSignatures::Mismatch +} + +const fn find(haystack: &[u8], 
needle: &[u8], from: usize) -> Option { + let mut i = 0; + let mut j = from; + while i < needle.len() { if j >= haystack.len() { - return CpuAndGpuKernelSignatures::Mismatch; + return None; } if needle[i] == haystack[j] { @@ -25,5 +45,23 @@ pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures } } - CpuAndGpuKernelSignatures::Match + Some(j) +} + +const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { + let mut i = 0; + + while i < needle.len() { + if (from + i) >= haystack.len() { + return false; + } + + if needle[i] == haystack[from + i] { + i += 1; + } else { + return false; + } + } + + true } From a3ec63a7611280b63f3c8eda90d92fc3f2f2964d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 20 Dec 2023 12:59:18 +0000 Subject: [PATCH 050/120] More work into kernel functions instead of traits --- examples/print/src/main.rs | 14 +- examples/single-source/src/main.rs | 18 ++- rust-cuda-derive/src/kernel/specialise/ty.rs | 62 +++++++-- rust-cuda-derive/src/kernel/utils.rs | 6 - rust-cuda-derive/src/kernel/wrapper/config.rs | 18 +-- .../src/kernel/wrapper/generate/args_trait.rs | 33 +---- .../generate/cpu_linker_macro/get_ptx.rs | 14 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 63 ++++----- .../generate/cpu_wrapper/kernel_func.rs | 82 ++++-------- .../kernel_func_async/async_func_types.rs | 17 ++- .../kernel_func_async/launch_types.rs | 17 ++- .../cpu_wrapper/kernel_func_async/mod.rs | 71 +++------- .../wrapper/generate/cpu_wrapper/mod.rs | 44 +----- .../wrapper/generate/cuda_generic_function.rs | 2 - .../kernel/wrapper/generate/cuda_wrapper.rs | 28 ++-- .../src/kernel/wrapper/inputs/mod.rs | 69 +--------- rust-cuda-derive/src/kernel/wrapper/mod.rs | 125 +++--------------- rust-cuda-derive/src/kernel/wrapper/parse.rs | 7 + rust-cuda-derive/src/lib.rs | 1 + src/host.rs | 46 +++++-- 20 files changed, 280 insertions(+), 457 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 
cedaa6bae..dc38b3fa9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -8,6 +8,7 @@ #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] +#![feature(decl_macro)] extern crate alloc; @@ -20,7 +21,7 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(pub use link! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { match action { @@ -39,7 +40,7 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { link! { impl kernel for KernelPtx } // Initialize the CUDA API - /*rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; @@ -57,7 +58,7 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, - )?);*/ + )?); // Create a new instance of the CUDA kernel and prepare the launch config let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); @@ -67,17 +68,16 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { shared_memory_size: 0, ptx_jit: false, }; - // let mut launcher = rust_cuda::host::Launcher { kernel: &mut typed_kernel, config }; // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1(&config, Action::Print)?; + kernel.launch1(&stream, &config, Action::Print)?; // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - kernel.launch1(&config, Action::Panic)?; + kernel.launch1(&stream, &config, Action::Panic)?; // kernel(&mut launcher, Action::Panic)?; 
println!("Launching alloc error kernel ..."); - kernel.launch1(&config, Action::AllocError)?; + kernel.launch1(&stream, &config, Action::AllocError)?; // kernel(&mut launcher, Action::AllocError)?; Ok(()) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b06f7031c..796e6ee4f 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] +#![feature(decl_macro)] extern crate alloc; @@ -46,13 +47,20 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(use link! as impl Kernel for Launcher)] +#[rc::common::kernel(pub use link! for impl)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), forbid(ptx::local_memory_usage, ptx::register_spills) )] -pub fn kernel<'a, T: rc::common::RustToCuda + rc::safety::StackOnly + rc::safety::NoSafeAliasing>( +pub fn kernel< + 'a, + T: rc::common::RustToCuda< + CudaRepresentation: rc::safety::StackOnly, + CudaAllocation: rc::common::EmptyCudaAlloc, + > + rc::safety::StackOnly + + rc::safety::NoSafeAliasing, +>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -80,12 +88,12 @@ pub fn kernel<'a, T: rc::common::RustToCuda(std::marker::PhantomData<&'a T>); - link! { impl kernel<'a, crate::Empty> for KernelPtx } - link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } + crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } + crate::link! 
{ impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 9b5a06955..196f2556a 100644 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -1,11 +1,16 @@ use proc_macro::TokenStream; +use quote::ToTokens; pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - let SpecialiseTypeConfig { kernel, typedef } = match syn::parse_macro_input::parse(tokens) { + let SpecialiseTypeConfig { + _private, // TODO: either use or remove the private path + args, + typedef, + } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(KERNEL::TYPEDEF) expects KERNEL and TYPEDEF identifiers: \ + "specialise_kernel_type!(ARGS::TYPEDEF) expects ARGS path and TYPEDEF identifier: \ {:?}", err ) @@ -20,15 +25,47 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - kernel.to_string().to_uppercase() + args.to_string().to_uppercase() ); match proc_macro::tracked_env::var(&specialisation_var) { Ok(specialisation) => { - match format!("<() as {kernel}{specialisation}>::{typedef}").parse() { - Ok(parsed_specialisation) => parsed_specialisation, + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - } + }; + + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: syn::parse_quote!(()), + position: 1, // 2, + as_token: syn::parse_quote!(as), + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: None, + segments: [ + 
// syn::PathSegment { + // ident: private, + // arguments: syn::PathArguments::None, + // }, + syn::PathSegment { + ident: args, + arguments: specialisation, + }, + syn::PathSegment { + ident: typedef, + arguments: syn::PathArguments::None, + }, + ] + .into_iter() + .collect(), + }, + }) + .into_token_stream() + .into() }, Err(err) => abort_call_site!( "Failed to read specialisation from {:?}: {:?}", @@ -39,16 +76,23 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { } struct SpecialiseTypeConfig { - kernel: syn::Ident, + _private: syn::Ident, + args: syn::Ident, typedef: syn::Ident, } impl syn::parse::Parse for SpecialiseTypeConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { - let kernel: syn::Ident = input.parse()?; + let private: syn::Ident = input.parse()?; + let _dc: syn::token::Colon2 = input.parse()?; + let args: syn::Ident = input.parse()?; let _dc: syn::token::Colon2 = input.parse()?; let typedef: syn::Ident = input.parse()?; - Ok(Self { kernel, typedef }) + Ok(Self { + _private: private, + args, + typedef, + }) } } diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-derive/src/kernel/utils.rs index 5afd05858..c73876f09 100644 --- a/rust-cuda-derive/src/kernel/utils.rs +++ b/rust-cuda-derive/src/kernel/utils.rs @@ -1,5 +1,3 @@ -use syn::spanned::Spanned; - pub fn skip_kernel_compilation() -> bool { let mut skip_compilation = false; @@ -13,7 +11,3 @@ pub fn skip_kernel_compilation() -> bool { skip_compilation } - -pub fn r2c_move_lifetime(arg: usize, ty: &syn::Type) -> syn::Lifetime { - syn::Lifetime::new(&format!("'__r2c_move_lt_{arg}"), ty.span()) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index 6ba9ebedc..cc9531acc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -1,9 +1,8 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, pub(super) linker: syn::Ident, - 
pub(super) kernel: syn::Ident, + pub(super) private: syn::Ident, pub(super) args: syn::Ident, - pub(super) launcher: syn::Ident, } impl syn::parse::Parse for KernelConfig { @@ -12,22 +11,17 @@ impl syn::parse::Parse for KernelConfig { let _use: syn::token::Use = input.parse()?; let linker: syn::Ident = input.parse()?; let _bang: syn::token::Bang = input.parse()?; - let _as: syn::token::As = input.parse()?; - let _impl: syn::token::Impl = input.parse()?; - let kernel: syn::Ident = input.parse()?; - let _lt_token: syn::token::Lt = input.parse()?; - let args: syn::Ident = input.parse()?; - let _comma: Option = input.parse()?; - let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; - let launcher: syn::Ident = input.parse()?; + let _impl: syn::token::Impl = input.parse()?; + + let private = syn::Ident::new("private", proc_macro::Span::def_site().into()); + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); Ok(Self { visibility, linker, - kernel, + private, args, - launcher, }) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs index 4c725601b..d45a35fb0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs @@ -1,22 +1,12 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::{FunctionInputs, ImplGenerics, KernelConfig}; pub(in super::super) fn quote_args_trait( - KernelConfig { - visibility, args, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - generic_trait_where_clause: generic_where_clause, - .. - }: &DeclGenerics, + KernelConfig { args, .. }: &KernelConfig, ImplGenerics { impl_generics, ty_generics, - where_clause, }: &ImplGenerics, FunctionInputs { func_inputs, .. 
}: &FunctionInputs, ) -> TokenStream { @@ -52,25 +42,12 @@ pub(in super::super) fn quote_args_trait( .collect::>(); quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { + #[allow(non_camel_case_types)] + pub trait #args #impl_generics { #(#func_input_typedefs)* } - // #args must always be pub in CUDA kernel as it is used to define the - // public kernel entry point signature - #[cfg(target_os = "cuda")] - #[allow(clippy::missing_safety_doc)] - pub unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - unsafe impl #impl_generics #args #ty_generics for () #where_clause { + impl #impl_generics #args #ty_generics for () { #(#func_input_types)* } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 790b3b8df..75fc008ed 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,7 +3,9 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::{ + DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -19,6 +21,7 @@ pub(super) fn quote_get_ptx( generic_close_token, .. 
}: &DeclGenerics, + impl_generics: &ImplGenerics, inputs: &FunctionInputs, func_params: &[syn::Ident], macro_type_ids: &[syn::Ident], @@ -32,6 +35,8 @@ pub(super) fn quote_get_ptx( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + let args_trait = super::super::args_trait::quote_args_trait(config, impl_generics, inputs); + let cpu_func_lifetime_erased_types = generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); @@ -79,6 +84,9 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + #[allow(unused_imports)] + use __rust_cuda_ffi_safe_assert::#args; + #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* @@ -92,7 +100,9 @@ pub(super) fn quote_get_ptx( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { #[allow(unused_imports)] - use super::#args; + use super::*; + + #args_trait extern "C" { #( #[allow(dead_code)] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 495b61870..ae2be49d9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -1,36 +1,34 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod get_ptx; use get_ptx::quote_get_ptx; +#[allow(clippy::too_many_arguments)] // FIXME pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { - visibility, - linker, - kernel, - launcher, - .. + visibility, linker, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, - generic_trait_params: generic_params, generic_close_token, generic_kernel_params, .. }: &DeclGenerics, + impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, func_ident @ FuncIdent { func_ident: func_ident_name, - func_ident_hash, .. + func_ident_hash, + .. }: &FuncIdent, func_params: &[syn::Ident], ptx_lint_levels: &TokenStream, ) -> TokenStream { - let macro_generics = generic_kernel_params//generic_params + let macro_generics = generic_kernel_params .iter() .enumerate() .map(|(i, generic)| { @@ -39,7 +37,7 @@ pub(in super::super) fn quote_cpu_linker_macro( match generic { syn::GenericParam::Type(_) => quote!($#generic_ident:ty), syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime),//unreachable!(), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime), } }) .collect::>(); @@ -48,7 +46,7 @@ pub(in super::super) fn quote_cpu_linker_macro( .map(|i| quote::format_ident!("__g_{}", i)) .collect::>(); - let macro_only_lt_generic_ids = generic_kernel_params//generic_params + let macro_only_lt_generic_ids = generic_kernel_params .iter() .enumerate() .filter_map(|(i, generic)| { @@ -61,7 +59,7 @@ pub(in super::super) fn quote_cpu_linker_macro( }) .collect::>(); - let macro_non_lt_generic_ids = generic_kernel_params//generic_params + let macro_non_lt_generic_ids = generic_kernel_params .iter() .enumerate() .filter_map(|(i, generic)| { @@ -74,17 +72,12 @@ pub(in super::super) fn quote_cpu_linker_macro( }) .collect::>(); - let cpu_linker_macro_visibility = if visibility.is_some() { - quote! { #[macro_export] } - } else { - quote! {} - }; - let get_ptx = quote_get_ptx( crate_path, func_ident, config, decl_generics, + impl_generics, func_inputs, func_params, ¯o_non_lt_generic_ids, @@ -93,25 +86,25 @@ pub(in super::super) fn quote_cpu_linker_macro( quote! 
{ #[cfg(not(target_os = "cuda"))] - #cpu_linker_macro_visibility - macro_rules! #linker { - (impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident) => { - unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< - #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token - //dyn #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token - > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token // #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token - { - #get_ptx + #visibility macro #linker( + impl #func_ident_name #generic_start_token + #(#macro_generics),* $(,)? + #generic_close_token for $ptx:ident + ) { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token + { + #get_ptx - fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_non_lt_generic_ids),* - #generic_close_token - ) - } + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::host::specialise_kernel_call!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) } - }; + } } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 6fa778eb3..b0fa4625f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -8,12 +8,12 @@ use super::super::super::{ #[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { kernel, args, 
visibility, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, - generic_wrapper_params, - generic_wrapper_where_clause, + generic_start_token, + generic_close_token, .. }: &DeclGenerics, inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, @@ -21,32 +21,6 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .enumerate() @@ -59,7 +33,7 @@ pub(super) fn quote_kernel_func_inputs( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type: syn::Type = syn::parse_quote! { - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; let syn_type = if let syn::Type::Reference(syn::TypeReference { and_token, @@ -88,21 +62,31 @@ pub(super) fn quote_kernel_func_inputs( }) .unzip(); + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let raw_func_input_wrap = - generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); + generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params, &launcher); + + let full_generics = generic_kernel_params + .iter() + .map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) + | syn::GenericParam::Const(syn::ConstParam { ident, .. 
}) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }) + .collect::>(); - let full_generics = generic_kernel_params.iter().map(|param| match param { - syn::GenericParam::Type(syn::TypeParam { ident, .. }) | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), - syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), - }).collect::>(); - let ty_turbofish = ty_generics.as_turbofish(); quote! { #[cfg(not(target_os = "cuda"))] #[allow(non_camel_case_types)] - #visibility type #func_ident <#generic_kernel_params> = impl Copy + Fn( - &mut #crate_path::host::Launcher<#func_ident <#(#full_generics),*>>, + pub type #func_ident #generic_start_token + #generic_kernel_params + #generic_close_token = impl Copy + Fn( + &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, #(#kernel_func_input_tys),* ) -> #crate_path::rustacuda::error::CudaResult<()>; @@ -112,17 +96,10 @@ pub(super) fn quote_kernel_func_inputs( #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - #visibility fn #func_ident ( - // &mut self, - // TODO: move the stream - // stream: &'stream #crate_path::rustacuda::stream::Stream, - // kernel: &mut #crate_path::host::TypedKernel<#func_ident #ty_generics>, - launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, + pub fn #func_ident <#generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, #(#kernel_func_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - // TODO: don't allow where clause - //#generic_wrapper_where_clause - { + ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; // impls check adapted from Nikolai Vazquez's `impls` crate: @@ -141,9 +118,7 @@ pub(super) fn quote_kernel_func_inputs( >::SYNC } - 
todo!() - - // #raw_func_input_wrap + #raw_func_input_wrap } } } @@ -159,6 +134,7 @@ fn generate_raw_func_input_wrap( func_ident_async, .. }: &FuncIdent, func_params: &[syn::Ident], + launcher: &syn::Ident, ) -> TokenStream { func_inputs .iter() @@ -167,8 +143,8 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_async(stream, #(#func_params),*)?; - stream.synchronize() + #func_ident_async(#launcher, #(#func_params),*)?; + #launcher.stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index efe8026eb..652ff4bc6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -1,18 +1,17 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, + stream: &syn::Lifetime, ) -> Vec { func_inputs .iter() @@ -27,7 +26,7 @@ pub(super) fn generate_async_func_types( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! { - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; let cuda_type = match cuda_mode { @@ -47,6 +46,8 @@ pub(super) fn generate_async_func_types( .. 
}) = &**ty { + let lifetime = lifetime.clone().unwrap_or(syn::parse_quote!('_)); + let wrapped_type = if mutability.is_some() { if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { abort!( @@ -56,11 +57,11 @@ pub(super) fn generate_async_func_types( } quote!( - #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime, #cuda_type> ) } else { quote!( - #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime, #cuda_type> ) }; @@ -68,10 +69,8 @@ pub(super) fn generate_async_func_types( #(#attrs)* #mutability #pat #colon_token #wrapped_type } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - let wrapped_type = quote! { - #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_type> }; quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 454bdcd57..55771c3c8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -1,13 +1,11 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { func_inputs, @@ -25,7 +23,7 @@ pub(in super::super) fn generate_launch_types( syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; cpu_func_unboxed_types.push(syn_type.clone()); @@ -48,20 +46,21 @@ pub(in super::super) fn generate_launch_types( .. }) = &**ty { + let comma: Option = + lifetime.as_ref().map(|_| syn::parse_quote!(,)); + if mutability.is_some() { quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#cuda_type> } } else { quote! { #cuda_type } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 63d0d472f..d4830d254 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -13,48 +13,26 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config @ KernelConfig { kernel, .. }: &KernelConfig, + config: &KernelConfig, impl_generics @ ImplGenerics { ty_generics, .. 
}: &ImplGenerics, DeclGenerics { - generic_wrapper_params, - generic_wrapper_where_clause, + generic_kernel_params, .. }: &DeclGenerics, func_inputs: &FunctionInputs, FuncIdent { - func_ident_async, .. + func_ident, + func_ident_async, + .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); let kernel_func_async_inputs = - generate_async_func_types(crate_path, config, impl_generics, func_inputs); + generate_async_func_types(crate_path, config, impl_generics, func_inputs, &stream); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_unboxed_types) = @@ -67,31 +45,18 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - fn #func_ident_async <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, + pub fn #func_ident_async <#stream, #generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, #(#kernel_func_async_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - let #crate_path::host::LaunchPackage { - kernel, watcher, 
config - } = #crate_path::host::Launcher::get_launch_package(self); - - let kernel_jit_result = if config.ptx_jit { - kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? + ) -> #crate_path::rustacuda::error::CudaResult<()> { + let kernel_jit_result = if #launcher.config.ptx_jit { + #launcher.kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? } else { - kernel.compile_with_ptx_jit_args(None)? + #launcher.kernel.compile_with_ptx_jit_args(None)? }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) => { - // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; - - function - }, - #crate_path::host::KernelJITResult::Cached(function) => function, + #crate_path::host::KernelJITResult::Recompiled(function) + | #crate_path::host::KernelJITResult::Cached(function) => function, }; #[allow(clippy::redundant_closure_call)] @@ -109,9 +74,9 @@ pub(super) fn quote_kernel_func_async( let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, - } = config; + } = #launcher.config.clone(); - unsafe { stream.launch(function, grid, block, shared_memory_size, + unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, &[ #( &#func_params as *const _ as *mut ::core::ffi::c_void diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index 7007abe87..ef99f68fc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -1,8 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{ - BlanketGenerics, DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig, -}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod kernel_func; mod kernel_func_async; @@ -13,22 +11,9 @@ use kernel_func_async::quote_kernel_func_async; 
#[allow(clippy::too_many_arguments)] pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - decl @ DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - .. - }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - BlanketGenerics { - blanket_ty, - impl_generics: blanket_impl_generics, - where_clause: blanket_where_clause, - }: &BlanketGenerics, + config: &KernelConfig, + decl: &DeclGenerics, + impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, fn_ident: &FuncIdent, func_params: &[syn::Ident], @@ -56,27 +41,8 @@ pub(in super::super) fn quote_cpu_wrapper( ); quote! { - // #[cfg(not(target_os = "cuda"))] - // #[allow(clippy::missing_safety_doc)] - // #visibility unsafe trait #kernel #generic_start_token - // #generic_trait_params - // #generic_close_token: #crate_path::host::CompiledKernelPtx< - // dyn #kernel #ty_generics - // > #generic_trait_where_clause - // { - // #kernel_func - - // #kernel_func_async - // } - - // #[cfg(not(target_os = "cuda"))] - // #[allow(clippy::missing_safety_doc)] - // unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty - // #blanket_where_clause - // {} - #kernel_func - // #kernel_func_async + #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index 628642fc0..aa23b77c6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -7,7 +7,6 @@ pub(in super::super) fn quote_cuda_generic_function( generic_start_token, generic_kernel_params: generic_params, generic_close_token, - generic_kernel_where_clause: generic_where_clause, .. 
}: &DeclGenerics, func_inputs: &syn::punctuated::Punctuated, @@ -19,7 +18,6 @@ pub(in super::super) fn quote_cuda_generic_function( #[cfg(target_os = "cuda")] #(#func_attrs)* fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) - #generic_where_clause #func_block } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 40d4abfbf..3e573d583 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -4,13 +4,13 @@ use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, InputCudaType, KernelConfig, + FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, }; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - config @ KernelConfig { args, .. }: &KernelConfig, + config @ KernelConfig { args, private, .. }: &KernelConfig, inputs @ FunctionInputs { func_inputs, func_input_cuda_types, @@ -20,6 +20,7 @@ pub(in super::super) fn quote_cuda_wrapper( func_ident_hash, .. }: &FuncIdent, + impl_generics: &ImplGenerics, func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { @@ -56,7 +57,7 @@ pub(in super::super) fn quote_cuda_wrapper( let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) }; match cuda_mode { @@ -99,12 +100,21 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); + let args_trait = super::args_trait::quote_args_trait(config, impl_generics, inputs); + quote! 
{ + // TODO: args trait should not be publicly available like this + // but specialisation requires it right now + #args_trait + #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + #[allow(unused_imports)] + use __rust_cuda_ffi_safe_assert::#args; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } @@ -123,7 +133,9 @@ pub(in super::super) fn quote_cuda_wrapper( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { #[allow(unused_imports)] - use super::#args; + use super::*; + + #args_trait extern "C" { #( #[allow(dead_code)] @@ -149,7 +161,7 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ptx_func_inputs( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, func_input_cuda_types, @@ -170,7 +182,7 @@ fn specialise_ptx_func_inputs( ) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) }; let cuda_type = match cuda_mode { @@ -228,7 +240,7 @@ fn specialise_ptx_func_inputs( fn specialise_ptx_unboxed_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, ) -> Vec { func_inputs @@ -239,7 +251,7 @@ fn specialise_ptx_unboxed_types( let type_ident = quote::format_ident!("__T_{}", i); quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index 4a25bf958..9222de237 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -1,7 +1,5 @@ use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::{InputCudaType, InputPtxJit}; mod attribute; @@ -12,12 +10,7 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } -pub(super) fn parse_function_inputs( - func: &syn::ItemFn, - generic_params: &mut syn::punctuated::Punctuated, -) -> FunctionInputs { - let mut implicit_lifetime_id: usize = 0; - +pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { let (func_inputs, func_input_cuda_types): ( syn::punctuated::Punctuated, Vec<(InputCudaType, InputPtxJit)>, @@ -25,8 +18,7 @@ pub(super) fn parse_function_inputs( .sig .inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { + .map(|arg| match arg { receiver @ syn::FnArg::Receiver(_) => { abort!(receiver.span(), "Kernel function must not have a receiver.") }, @@ -94,13 +86,7 @@ pub(super) fn parse_function_inputs( ); }); - let ty = ensure_reference_type_lifetime( - i, - ty, - &cuda_type, - &mut implicit_lifetime_id, - generic_params, - ); + let ty = ensure_reference_type_lifetime(ty, &cuda_type); ( syn::FnArg::Typed(syn::PatType { @@ -122,13 +108,7 @@ pub(super) fn parse_function_inputs( } #[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime( - i: usize, - ty: &syn::Type, - cuda_type: &InputCudaType, - implicit_lifetime_id: &mut usize, - generic_params: &mut syn::punctuated::Punctuated, -) -> Box { +fn ensure_reference_type_lifetime(ty: &syn::Type, 
cuda_type: &InputCudaType) -> Box { match ty { syn::Type::Reference(syn::TypeReference { and_token, @@ -136,27 +116,6 @@ fn ensure_reference_type_lifetime( mutability, elem, }) => { - // let lifetime = lifetime.clone().unwrap_or_else(|| { - // let lifetime = syn::Lifetime::new( - // &format!("'__r2c_lt_{implicit_lifetime_id}"), - // lifetime.span(), - // ); - - // generic_params.insert( - // *implicit_lifetime_id, - // syn::GenericParam::Lifetime(syn::LifetimeDef { - // attrs: Vec::new(), - // lifetime: lifetime.clone(), - // colon_token: None, - // bounds: syn::punctuated::Punctuated::new(), - // }), - // ); - - // *implicit_lifetime_id += 1; - - // lifetime - // }); - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { (|| { if let syn::Type::Path(syn::TypePath { @@ -203,27 +162,11 @@ fn ensure_reference_type_lifetime( Box::new(syn::Type::Reference(syn::TypeReference { and_token: *and_token, - lifetime: lifetime.clone(),//Some(lifetime), + lifetime: lifetime.clone(), mutability: *mutability, elem, })) }, - ty => { - // if matches!(cuda_type, InputCudaType::LendRustToCuda) { - // generic_params.insert( - // *implicit_lifetime_id, - // syn::GenericParam::Lifetime(syn::LifetimeDef { - // attrs: Vec::new(), - // lifetime: r2c_move_lifetime(i, ty), - // colon_token: None, - // bounds: syn::punctuated::Punctuated::new(), - // }), - // ); - - // *implicit_lifetime_id += 1; - // } - - Box::new(ty.clone()) - }, + ty => Box::new(ty.clone()), } } diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a4db5f7f3..3d42c9d8b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -132,116 +132,33 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } }; - let mut generic_kernel_params = func.sig.generics.params.clone(); - let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); - - let (generic_start_token, 
generic_close_token) = if generic_kernel_params.is_empty() { - (None, None) - } else if let (Some(start), Some(close)) = - (func.sig.generics.lt_token, func.sig.generics.gt_token) - { - (Some(start), Some(close)) - } else { - (Some(syn::parse_quote!(<)), Some(syn::parse_quote!(>))) - }; + let mut func_inputs = parse_function_inputs(&func); + + let generic_kernel_params = func.sig.generics.params.clone(); + let (generic_start_token, generic_close_token) = + (func.sig.generics.lt_token, func.sig.generics.gt_token); let generic_trait_params = generic_kernel_params .iter() .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_))) .cloned() .collect(); - let generic_wrapper_params = generic_kernel_params - .iter() - .filter(|generic_param| matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - - let generic_kernel_where_clause = &func.sig.generics.where_clause; - let generic_trait_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| !matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); - let generic_wrapper_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); let decl_generics = DeclGenerics { generic_start_token: &generic_start_token, - generic_trait_params: &generic_trait_params, generic_close_token: &generic_close_token, - generic_trait_where_clause: &generic_trait_where_clause, - generic_wrapper_params: &generic_wrapper_params, - generic_wrapper_where_clause: 
&generic_wrapper_where_clause, generic_kernel_params: &generic_kernel_params, - generic_kernel_where_clause, }; let trait_generics = syn::Generics { lt_token: generic_start_token, - params: generic_trait_params.clone(), + params: generic_trait_params, gt_token: generic_close_token, - where_clause: generic_trait_where_clause.clone(), - }; - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - let blanket_ty = syn::Ident::new("K", Span::mixed_site()); - let mut blanket_params = generic_trait_params.clone(); - blanket_params.push(syn::GenericParam::Type(syn::TypeParam { - attrs: Vec::new(), - ident: blanket_ty.clone(), - colon_token: syn::parse_quote!(:), - bounds: { - let kernel = &config.kernel; - syn::parse_quote! { - #crate_path::host::CompiledKernelPtx< - dyn #kernel #ty_generics - > - } - }, - eq_token: None, - default: None, - })); - let trait_blanket_generics = syn::Generics { - lt_token: Some(generic_start_token.unwrap_or(syn::parse_quote!(<))), - params: blanket_params, - gt_token: Some(generic_close_token.unwrap_or(syn::parse_quote!(>))), - where_clause: generic_trait_where_clause.clone(), - }; - let (blanket_impl_generics, _, blanket_where_clause) = trait_blanket_generics.split_for_impl(); - let blanket_generics = BlanketGenerics { - blanket_ty, - impl_generics: blanket_impl_generics, - where_clause: blanket_where_clause, + where_clause: None, }; + let (impl_generics, ty_generics, _where_clause) = trait_generics.split_for_impl(); let impl_generics = ImplGenerics { impl_generics, ty_generics, - where_clause, }; let func_ident = FuncIdent { @@ -293,13 +210,12 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); + let args_trait = quote_args_trait(&config, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( &crate_path, &config, &decl_generics, &impl_generics, - &blanket_generics, 
&func_inputs, &func_ident, &func_params, @@ -310,6 +226,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &crate_path, &config, &decl_generics, + &impl_generics, &func_inputs, &func_ident, &func_params, @@ -320,6 +237,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &config, &func_inputs, &func_ident, + &impl_generics, &func.attrs, &func_params, ); @@ -330,9 +248,16 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, &func.block, ); + let private = &config.private; (quote! { - #args_trait + mod #private { + #[allow(unused_imports)] + use super::*; + + #args_trait + } + #cpu_wrapper #cpu_cuda_check @@ -355,26 +280,14 @@ struct InputPtxJit(bool); #[allow(clippy::struct_field_names)] struct DeclGenerics<'f> { generic_start_token: &'f Option, - generic_trait_params: &'f syn::punctuated::Punctuated, generic_close_token: &'f Option, - generic_trait_where_clause: &'f Option, - generic_wrapper_params: &'f syn::punctuated::Punctuated, - generic_wrapper_where_clause: &'f Option, generic_kernel_params: &'f syn::punctuated::Punctuated, - generic_kernel_where_clause: &'f Option, } struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] impl_generics: syn::ImplGenerics<'f>, ty_generics: syn::TypeGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, -} - -struct BlanketGenerics<'f> { - blanket_ty: syn::Ident, - impl_generics: syn::ImplGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, } #[allow(clippy::struct_field_names)] diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 7d523adb0..56aa60053 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -50,5 +50,12 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ), }; + if let Some(r#where) = &func.sig.generics.where_clause { + abort!( + r#where.span(), + "Kernel function must not have a where 
clause, use type generic bounds instead." + ); + } + func } diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 74e76a2cc..1a0550bc5 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,7 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] +#![feature(proc_macro_def_site)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/host.rs b/src/host.rs index ea2bd11a8..5e01e5b1e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -28,20 +28,27 @@ use crate::{ safety::SafeDeviceCopy, }; -pub struct Launcher<'a, Kernel> { - pub kernel: &'a mut TypedPtxKernel, +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: &'stream Stream, + pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } -impl<'a, Kernel> Launcher<'a, Kernel> { +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { - self.kernel.launch0(&self.config) + pub fn launch0(&mut self) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, + { + self.kernel.launch0(self.stream, &self.config) } #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { - self.kernel.launch1(&self.config, arg1) + pub fn launch1(&mut self, arg1: A) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, + { + self.kernel.launch1(self.stream, &self.config, arg1) } } @@ -173,13 +180,30 @@ impl TypedPtxKernel { } #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self, config: &LaunchConfig) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { - (const { conjure::() })(&mut 
Launcher { kernel: self, config: config.clone() }) + pub fn launch0(&mut self, stream: &Stream, config: &LaunchConfig) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, + { + (const { conjure::() })(&mut Launcher { + stream, + kernel: self, + config: config.clone(), + }) } #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, config: &LaunchConfig, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { - (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }, arg1) + pub fn launch1(&mut self, stream: &Stream, config: &LaunchConfig, arg1: A) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, + { + (const { conjure::() })( + &mut Launcher { + stream, + kernel: self, + config: config.clone(), + }, + arg1, + ) } } From 9af625374ede272a2090bf8ff48e21e72f9db3dd Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 04:59:58 +0000 Subject: [PATCH 051/120] Eliminate almost all ArgsTrait usages --- rust-cuda-derive/Cargo.toml | 2 +- rust-cuda-derive/src/kernel/link/config.rs | 9 +- rust-cuda-derive/src/kernel/link/mod.rs | 23 +- .../specialise/{call.rs => entry_point.rs} | 6 +- .../specialise/{entry.rs => function.rs} | 6 +- rust-cuda-derive/src/kernel/specialise/mod.rs | 4 +- rust-cuda-derive/src/kernel/specialise/ty.rs | 281 ++++++++++++++---- rust-cuda-derive/src/kernel/wrapper/config.rs | 12 +- .../{ => cpu_linker_macro}/args_trait.rs | 4 +- .../generate/cpu_linker_macro/get_ptx.rs | 16 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 6 +- .../generate/cpu_wrapper/kernel_func.rs | 49 +-- .../kernel_func_async/async_func_types.rs | 13 +- .../kernel_func_async/launch_types.rs | 15 +- .../cpu_wrapper/kernel_func_async/mod.rs | 10 +- .../wrapper/generate/cpu_wrapper/mod.rs | 6 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 62 ++-- .../src/kernel/wrapper/generate/mod.rs | 1 - rust-cuda-derive/src/kernel/wrapper/mod.rs 
| 25 +- rust-cuda-derive/src/lib.rs | 8 +- src/device/mod.rs | 2 +- src/host.rs | 2 +- 22 files changed, 328 insertions(+), 234 deletions(-) rename rust-cuda-derive/src/kernel/specialise/{call.rs => entry_point.rs} (91%) rename rust-cuda-derive/src/kernel/specialise/{entry.rs => function.rs} (86%) rename rust-cuda-derive/src/kernel/wrapper/generate/{ => cpu_linker_macro}/args_trait.rs (92%) diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 31a686008..60677b1dd 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -12,7 +12,7 @@ links = "libnvptxcompiler_static" proc-macro = true [dependencies] -syn = { version = "1.0", features = ["full"] } +syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index efb7899fa..d7a4d0458 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -6,7 +6,6 @@ use super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, pub(super) kernel_hash: syn::Ident, - pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, @@ -17,7 +16,6 @@ impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; let kernel_hash: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -56,7 +54,6 @@ impl syn::parse::Parse for LinkKernelConfig { Ok(Self { kernel, kernel_hash, - args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), specialisation, @@ -67,22 +64,22 @@ impl syn::parse::Parse for LinkKernelConfig { #[allow(clippy::module_name_repetitions)] 
pub(super) struct CheckKernelConfig { + pub(super) kernel: syn::Ident, pub(super) kernel_hash: syn::Ident, - pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, } impl syn::parse::Parse for CheckKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { + let kernel: syn::Ident = input.parse()?; let kernel_hash: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; Ok(Self { + kernel, kernel_hash, - args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), }) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index bcbe297cf..8424e7056 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -36,22 +36,22 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); let CheckKernelConfig { + kernel, kernel_hash, - args, crate_name, crate_path, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "check_kernel!(HASH ARGS NAME PATH) expects HASH and ARGS identifiers, annd NAME \ - and PATH string literals: {:?}", + "check_kernel!(KERNEL HASH NAME PATH) expects KERNEL and HASH identifiers, annd \ + NAME and PATH string literals: {:?}", err ) }, }; - let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); + let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { return quote!(::core::result::Result::Err(())).into(); @@ -74,9 +74,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { }); let LinkKernelConfig { - kernel: _kernel, + kernel, kernel_hash, - args, crate_name, crate_path, specialisation, @@ -85,9 +84,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => 
config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL HASH ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL, \ - HASH, and ARGS identifiers, NAME and PATH string literals, and SPECIALISATION \ - and LINTS tokens: {:?}", + "link_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ + HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \ + tokens: {:?}", err ) }, @@ -101,7 +100,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } let Some(mut kernel_ptx) = compile_kernel( - &args, + &kernel, &crate_name, &crate_path, Specialisation::Link(&specialisation), @@ -601,7 +600,7 @@ fn check_kernel_ptx( } fn compile_kernel( - args: &syn::Ident, + kernel: &syn::Ident, crate_name: &str, crate_path: &Path, specialisation: Specialisation, @@ -618,7 +617,7 @@ fn compile_kernel( let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - args.to_string().to_uppercase() + kernel.to_string().to_uppercase() ); match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-derive/src/kernel/specialise/entry_point.rs similarity index 91% rename from rust-cuda-derive/src/kernel/specialise/call.rs rename to rust-cuda-derive/src/kernel/specialise/entry_point.rs index 10e43d26a..5653a5539 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-derive/src/kernel/specialise/entry_point.rs @@ -3,7 +3,7 @@ use std::ffi::CStr; use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { let SpecialiseMangleConfig { kernel, specialisation, @@ -11,8 +11,8 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_call!(KERNEL SPECIALISATION) 
expects KERNEL identifier and \ - SPECIALISATION tokens: {:?}", + "specialise_kernel_entry_point!(KERNEL SPECIALISATION) expects KERNEL identifier \ + and SPECIALISATION tokens: {:?}", err ) }, diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-derive/src/kernel/specialise/function.rs similarity index 86% rename from rust-cuda-derive/src/kernel/specialise/entry.rs rename to rust-cuda-derive/src/kernel/specialise/function.rs index b85a433e7..068f30d97 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-derive/src/kernel/specialise/function.rs @@ -3,10 +3,10 @@ use std::env::VarError; use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func: syn::ItemFn = syn::parse(func).unwrap_or_else(|err| { abort_call_site!( - "#[specialise_kernel_entry(...)] must be wrapped around a function: {:?}", + "#[specialise_kernel_function(...)] must be wrapped around a function: {:?}", err ) }); @@ -14,7 +14,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr let kernel: syn::Ident = match syn::parse_macro_input::parse(attr) { Ok(kernel) => kernel, Err(err) => abort_call_site!( - "#[specialise_kernel_entry(KERNEL)] expects KERNEL identifier: {:?}", + "#[specialise_kernel_function(KERNEL)] expects KERNEL identifier: {:?}", err ), }; diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-derive/src/kernel/specialise/mod.rs index 337508b5b..6d30d4d5d 100644 --- a/rust-cuda-derive/src/kernel/specialise/mod.rs +++ b/rust-cuda-derive/src/kernel/specialise/mod.rs @@ -1,3 +1,3 @@ -pub mod call; -pub mod entry; +pub mod entry_point; +pub mod function; pub mod ty; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 196f2556a..9805abc3c 100644 
--- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -3,15 +3,15 @@ use quote::ToTokens; pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let SpecialiseTypeConfig { - _private, // TODO: either use or remove the private path - args, - typedef, + mut ty, + generics, + kernel, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(ARGS::TYPEDEF) expects ARGS path and TYPEDEF identifier: \ - {:?}", + "specialise_kernel_type!(TY for GENERICS in KERNEL) expects TY type, GENERICS \ + generics, and KERNEL identifier: {:?}", err ) }, @@ -25,74 +25,243 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - args.to_string().to_uppercase() + kernel.to_string().to_uppercase() ); - match proc_macro::tracked_env::var(&specialisation_var) { - Ok(specialisation) => { - let specialisation = match syn::parse_str(&specialisation) { - _ if specialisation.is_empty() => syn::PathArguments::None, - Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), - Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - }; - - syn::Type::Path(syn::TypePath { - qself: Some(syn::QSelf { - lt_token: syn::parse_quote!(<), - ty: syn::parse_quote!(()), - position: 1, // 2, - as_token: syn::parse_quote!(as), - gt_token: syn::parse_quote!(>), - }), - path: syn::Path { - leading_colon: None, - segments: [ - // syn::PathSegment { - // ident: private, - // arguments: syn::PathArguments::None, - // }, - syn::PathSegment { - ident: args, - arguments: specialisation, - }, - syn::PathSegment { - ident: typedef, - arguments: syn::PathArguments::None, - }, - ] - .into_iter() - .collect(), - }, - }) - .into_token_stream() - .into() - }, + let specialisation = match proc_macro::tracked_env::var(&specialisation_var) { + 
Ok(specialisation) => specialisation, Err(err) => abort_call_site!( "Failed to read specialisation from {:?}: {:?}", &specialisation_var, err ), + }; + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), + Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), + }; + + if let syn::PathArguments::AngleBracketed(syn::AngleBracketedGenericArguments { + args, .. + }) = specialisation + { + if generics.params.len() != args.len() { + abort_call_site!( + "Mismatch specialising {} with {}", + generics.split_for_impl().1.to_token_stream(), + args.to_token_stream() + ); + } + + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { + match (generic, arg) { + ( + syn::GenericParam::Lifetime(syn::LifetimeDef { + lifetime: generic, .. + }), + syn::GenericArgument::Lifetime(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldLifetimeGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Const(syn::ConstParam { ident: generic, .. }), + syn::GenericArgument::Const(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldConstGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Type(syn::TypeParam { ident: generic, .. 
}), + syn::GenericArgument::Type(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldTypeGeneric { generic, arg }, ty); + }, + (generic, arg) => abort_call_site!( + "Mismatch specialising {} with {}", + generic.to_token_stream(), + arg.to_token_stream() + ), + } + } + } else if !generics.params.is_empty() { + abort_call_site!( + "Missing specialisation for {}", + generics.split_for_impl().1.to_token_stream() + ); } + + ty.into_token_stream().into() } struct SpecialiseTypeConfig { - _private: syn::Ident, - args: syn::Ident, - typedef: syn::Ident, + ty: syn::Type, + generics: syn::Generics, + kernel: syn::Ident, } impl syn::parse::Parse for SpecialiseTypeConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { - let private: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let args: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let typedef: syn::Ident = input.parse()?; + let ty: syn::Type = input.parse()?; + let _for: syn::token::For = input.parse()?; + let generics: syn::Generics = input.parse()?; + let _in: syn::token::In = input.parse()?; + let kernel: syn::Ident = input.parse()?; Ok(Self { - _private: private, - args, - typedef, + ty, + generics, + kernel, }) } } + +struct FoldLifetimeGeneric { + generic: syn::Lifetime, + arg: syn::Lifetime, +} + +impl syn::fold::Fold for FoldLifetimeGeneric { + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { + if lt == self.generic { + self.arg.clone() + } else { + lt + } + } +} + +struct FoldConstGeneric { + generic: syn::Ident, + arg: syn::Expr, +} + +impl syn::fold::Fold for FoldConstGeneric { + fn fold_generic_argument(&mut self, arg: syn::GenericArgument) -> syn::GenericArgument { + let syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + })) = arg + else { + return syn::fold::fold_generic_argument(self, arg); + }; + + if let 
Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return syn::GenericArgument::Const(self.arg.clone()); + } + + syn::fold::fold_generic_argument( + self, + syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + })), + ) + } + + fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr { + let syn::Expr::Path(syn::ExprPath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + attrs, + }) = expr + else { + return syn::fold::fold_expr(self, expr); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return self.arg.clone(); + } + + syn::fold::fold_expr( + self, + syn::Expr::Path(syn::ExprPath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + attrs, + }), + ) + } +} + +struct FoldTypeGeneric { + generic: syn::Ident, + arg: syn::Type, +} + +impl syn::fold::Fold for FoldTypeGeneric { + fn fold_type(&mut self, ty: syn::Type) -> syn::Type { + let syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + }) = ty + else { + return syn::fold::fold_type(self, ty); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && ident == &self.generic + { + return if segments.len() > 1 { + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: Box::new(self.arg.clone()), + position: 0, + as_token: None, + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: syn::parse_quote!(::), + segments: segments.into_iter().skip(1).collect(), + }, + }) + } else { + self.arg.clone() + }; + } + + syn::fold::fold_type( + self, + syn::Type::Path(syn::TypePath { + qself: None, 
+ path: syn::Path { + leading_colon: None, + segments, + }, + }), + ) + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index cc9531acc..8f8cd2240 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -1,8 +1,6 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, pub(super) linker: syn::Ident, - pub(super) private: syn::Ident, - pub(super) args: syn::Ident, } impl syn::parse::Parse for KernelConfig { @@ -14,14 +12,6 @@ impl syn::parse::Parse for KernelConfig { let _for: syn::token::For = input.parse()?; let _impl: syn::token::Impl = input.parse()?; - let private = syn::Ident::new("private", proc_macro::Span::def_site().into()); - let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); - - Ok(Self { - visibility, - linker, - private, - args, - }) + Ok(Self { visibility, linker }) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs similarity index 92% rename from rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs index d45a35fb0..178ed026d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs @@ -1,9 +1,9 @@ use proc_macro2::TokenStream; -use super::super::{FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::super::{FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_args_trait( - KernelConfig { args, .. 
}: &KernelConfig, + args: &syn::Ident, ImplGenerics { impl_generics, ty_generics, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 75fc008ed..7e4b88f87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,9 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -15,7 +13,6 @@ pub(super) fn quote_get_ptx( func_ident_hash, .. }: &FuncIdent, - config @ KernelConfig { args, .. }: &KernelConfig, generics @ DeclGenerics { generic_start_token, generic_close_token, @@ -35,10 +32,11 @@ pub(super) fn quote_get_ptx( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let args_trait = super::super::args_trait::quote_args_trait(config, impl_generics, inputs); + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); + let args_trait = super::args_trait::quote_args_trait(&args, impl_generics, inputs); let cpu_func_lifetime_erased_types = - generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); + generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); let matching_kernel_assert = if skip_kernel_compilation() { quote!() @@ -49,7 +47,7 @@ pub(super) fn quote_get_ptx( }> = #crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( PTX_CSTR.to_bytes(), - #crate_path::host::specialise_kernel_call!( + 
#crate_path::host::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -88,7 +86,7 @@ pub(super) fn quote_get_ptx( use __rust_cuda_ffi_safe_assert::#args; #crate_path::host::link_kernel!{ - #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token + #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels } @@ -117,7 +115,7 @@ pub(super) fn quote_get_ptx( fn generate_lifetime_erased_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + args: &syn::Ident, DeclGenerics { generic_start_token, generic_close_token, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index ae2be49d9..f68b9cf34 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -2,6 +2,7 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +mod args_trait; mod get_ptx; use get_ptx::quote_get_ptx; @@ -9,7 +10,7 @@ use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, - config @ KernelConfig { + KernelConfig { visibility, linker, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { @@ -75,7 +76,6 @@ pub(in super::super) fn quote_cpu_linker_macro( let get_ptx = quote_get_ptx( crate_path, func_ident, - config, decl_generics, impl_generics, func_inputs, @@ -98,7 +98,7 @@ pub(in super::super) fn quote_cpu_linker_macro( #get_ptx fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_call!( + #crate_path::host::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_non_lt_generic_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index b0fa4625f..a51fc565a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,14 +1,9 @@ use proc_macro2::TokenStream; -use super::super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; -#[allow(clippy::too_many_arguments)] -#[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, @@ -21,46 +16,14 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs + let kernel_func_inputs = func_inputs.iter().collect::>(); + let kernel_func_input_tys = func_inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: syn::Type = syn::parse_quote! 
{ - <() as #private :: #args #ty_generics>::#type_ident - }; - let syn_type = if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: Box::new(syn_type), - }) - } else { - syn_type - }; - - let param = quote! { - #(#attrs)* #pat #colon_token #syn_type - }; - - (param, syn_type) - }, + .map(|arg| match arg { + syn::FnArg::Typed(syn::PatType { ty, .. }) => syn::Type::clone(ty), syn::FnArg::Receiver(_) => unreachable!(), }) - .unzip(); + .collect::>(); let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index 652ff4bc6..cd539a16c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -1,12 +1,10 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, InputCudaType}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, @@ -16,17 +14,16 @@ pub(super) fn generate_async_func_types( func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, colon_token, ty, }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #private :: #args #ty_generics>::#type_ident + let syn_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, }; let cuda_type = match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 55771c3c8..74402c939 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -1,29 +1,26 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, InputCudaType}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, -) -> (Vec, Vec) { +) -> (Vec, Vec) { let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .for_each(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { ty, .. 
}) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - <() as #private :: #args #ty_generics>::#type_ident + let syn_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, }; cpu_func_unboxed_types.push(syn_type.clone()); diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index d4830d254..c8ef3dbc0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod async_func_types; mod launch_types; @@ -13,8 +13,7 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config: &KernelConfig, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, .. 
@@ -31,12 +30,11 @@ pub(super) fn quote_kernel_func_async( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - let kernel_func_async_inputs = - generate_async_func_types(crate_path, config, impl_generics, func_inputs, &stream); + let kernel_func_async_inputs = generate_async_func_types(crate_path, func_inputs, &stream); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_unboxed_types) = - generate_launch_types(crate_path, config, impl_generics, func_inputs); + generate_launch_types(crate_path, func_inputs); quote! { #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index ef99f68fc..b863a478f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod kernel_func; mod kernel_func_async; @@ -8,10 +8,8 @@ mod kernel_func_async; use kernel_func::quote_kernel_func_inputs; use kernel_func_async::quote_kernel_func_async; -#[allow(clippy::too_many_arguments)] pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, - config: &KernelConfig, decl: &DeclGenerics, impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, @@ -21,7 +19,6 @@ pub(in super::super) fn quote_cpu_wrapper( ) -> TokenStream { let kernel_func = quote_kernel_func_inputs( crate_path, - config, impl_generics, decl, func_inputs, @@ -31,7 +28,6 @@ pub(in super::super) fn quote_cpu_wrapper( ); let kernel_func_async = quote_kernel_func_async( crate_path, - config, impl_generics, 
decl, func_inputs, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 3e573d583..058299b41 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -4,28 +4,32 @@ use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, + FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, }; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - config @ KernelConfig { args, private, .. }: &KernelConfig, inputs @ FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - FuncIdent { + func @ FuncIdent { func_ident, func_ident_hash, .. }: &FuncIdent, - impl_generics: &ImplGenerics, + impl_generics @ ImplGenerics { + impl_generics: generics, + .. + }: &ImplGenerics, func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(crate_path, config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(crate_path, config, inputs); + let (ptx_func_inputs, ptx_func_types) = + specialise_ptx_func_inputs(crate_path, inputs, func, impl_generics); + let ptx_func_unboxed_types = + specialise_ptx_unboxed_types(crate_path, inputs, func, impl_generics); let func_layout_params = func_params .iter() @@ -55,9 +59,12 @@ pub(in super::super) fn quote_cuda_wrapper( } } else { quote! {} }; - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; let syn_type = quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #generics in #func_ident) }; match cuda_mode { @@ -100,21 +107,12 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); - let args_trait = super::args_trait::quote_args_trait(config, impl_generics, inputs); - quote! { - // TODO: args trait should not be publicly available like this - // but specialisation requires it right now - #args_trait - #[cfg(target_os = "cuda")] - #[#crate_path::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { - #[allow(unused_imports)] - use __rust_cuda_ffi_safe_assert::#args; - unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } @@ -135,8 +133,6 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(unused_imports)] use super::*; - #args_trait - extern "C" { #( #[allow(dead_code)] static #func_params: #ptx_func_types; @@ -161,17 +157,17 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ptx_func_inputs( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> (Vec, Vec) { func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed( fn_arg @ syn::PatType { attrs, @@ -180,9 +176,12 @@ fn specialise_ptx_func_inputs( ty, }, ) => { - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; let syn_type = quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) }; let cuda_type = match cuda_mode { @@ -240,18 +239,21 @@ fn specialise_ptx_func_inputs( fn specialise_ptx_unboxed_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> Vec { func_inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { + .map(|arg| match arg { syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs index 4dd9b4096..c7a2fcabd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,3 @@ -pub mod args_trait; pub mod cpu_linker_macro; pub mod cpu_wrapper; pub mod cuda_generic_function; diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 3d42c9d8b..64f6f4f3f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -14,9 +14,8 @@ use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ - args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, - cpu_wrapper::quote_cpu_wrapper, 
cuda_generic_function::quote_cuda_generic_function, - cuda_wrapper::quote_cuda_wrapper, + cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, + cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, }; use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; @@ -210,10 +209,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let args_trait = quote_args_trait(&config, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( &crate_path, - &config, &decl_generics, &impl_generics, &func_inputs, @@ -221,7 +218,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); + let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident); let cpu_linker_macro = quote_cpu_linker_macro( &crate_path, &config, @@ -234,7 +231,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ); let cuda_wrapper = quote_cuda_wrapper( &crate_path, - &config, &func_inputs, &func_ident, &impl_generics, @@ -248,16 +244,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, &func.block, ); - let private = &config.private; (quote! { - mod #private { - #[allow(unused_imports)] - use super::*; - - #args_trait - } - #cpu_wrapper #cpu_cuda_check @@ -342,9 +330,10 @@ fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option< fn quote_generic_check( crate_path: &syn::Path, FuncIdent { - func_ident_hash, .. + func_ident, + func_ident_hash, + .. }: &FuncIdent, - KernelConfig { args, .. }: &KernelConfig, ) -> proc_macro2::TokenStream { let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { Ok(crate_name) => crate_name.to_uppercase(), @@ -357,7 +346,7 @@ fn quote_generic_check( quote::quote_spanned! 
{ func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( - #func_ident_hash #args #crate_name #crate_manifest_dir + #func_ident #func_ident_hash #crate_name #crate_manifest_dir ); } } diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 1a0550bc5..4651be684 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -54,15 +54,15 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { - kernel::specialise::call::specialise_kernel_call(tokens) +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) } #[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::entry::specialise_kernel_entry(attr, func) +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) } #[doc(hidden)] diff --git a/src/device/mod.rs b/src/device/mod.rs index ca9aab9fd..93811bb04 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -5,7 +5,7 @@ use core::{ #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{specialise_kernel_entry, specialise_kernel_type}; +pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, diff --git a/src/host.rs b/src/host.rs index 5e01e5b1e..8df7d2fbe 100644 --- a/src/host.rs +++ b/src/host.rs @@ -18,7 +18,7 @@ use rustacuda_core::{DeviceCopy, DevicePointer}; #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{check_kernel, 
link_kernel, specialise_kernel_call}; +pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; use crate::{ common::{ From 6868d6bba6e49540fba833ee5d68c543a8cd77a3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 06:11:57 +0000 Subject: [PATCH 052/120] Some refactoring of the async kernel func type + wrap code --- .../generate/cpu_wrapper/kernel_func.rs | 3 +- .../generate/cpu_wrapper/kernel_func_async.rs | 231 ++++++++++++++++++ .../kernel_func_async/async_func_types.rs | 83 ------- .../kernel_func_async/launch_types.rs | 71 ------ .../cpu_wrapper/kernel_func_async/mod.rs | 87 ------- .../kernel_func_async/type_wrap.rs | 53 ---- rust-cuda-derive/src/kernel/wrapper/mod.rs | 3 +- 7 files changed, 233 insertions(+), 298 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index a51fc565a..5cc6f8077 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -16,7 +16,6 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let kernel_func_inputs = func_inputs.iter().collect::>(); let kernel_func_input_tys = func_inputs .iter() .map(|arg| match arg { @@ -61,7 +60,7 @@ pub(super) fn quote_kernel_func_inputs( 
#[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, - #(#kernel_func_inputs),* + #func_inputs ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs new file mode 100644 index 000000000..8a0013900 --- /dev/null +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs @@ -0,0 +1,231 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; + +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; + +#[allow(clippy::too_many_arguments)] +pub(super) fn quote_kernel_func_async( + crate_path: &syn::Path, + ImplGenerics { ty_generics, .. }: &ImplGenerics, + DeclGenerics { + generic_kernel_params, + .. + }: &DeclGenerics, + func_inputs: &FunctionInputs, + FuncIdent { + func_ident, + func_ident_async, + .. + }: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); + + let ( + async_params, + launch_param_types, + unboxed_param_types, + launch_param_wrap, + ptx_jit_param_wrap, + ) = generate_type_wrap(crate_path, func_inputs, &stream); + + quote! 
{ + #[cfg(not(target_os = "cuda"))] + #(#func_attrs)* + #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + pub fn #func_ident_async <#stream, #generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, + #(#async_params),* + ) -> #crate_path::rustacuda::error::CudaResult<()> { + let kernel_jit_result = if #launcher.config.ptx_jit { + #launcher.kernel.compile_with_ptx_jit_args(#ptx_jit_param_wrap)? + } else { + #launcher.kernel.compile_with_ptx_jit_args(None)? + }; + let function = match kernel_jit_result { + #crate_path::host::KernelJITResult::Recompiled(function) + | #crate_path::host::KernelJITResult::Cached(function) => function, + }; + + #[allow(clippy::redundant_closure_call)] + (|#(#func_params: #launch_param_types),*| { + if false { + #[allow(dead_code)] + fn assert_impl_devicecopy(_val: &T) {} + + #[allow(dead_code)] + fn assert_impl_no_safe_aliasing() {} + + #(assert_impl_devicecopy(&#func_params);)* + #(assert_impl_no_safe_aliasing::<#unboxed_param_types>();)* + } + + let #crate_path::host::LaunchConfig { + grid, block, shared_memory_size, ptx_jit: _, + } = #launcher.config.clone(); + + unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, + &[ + #( + &#func_params as *const _ as *mut ::core::ffi::c_void + ),* + ] + ) } + })(#(#launch_param_wrap),*) + } + } +} + +#[allow(clippy::too_many_lines)] // FIXME +fn generate_type_wrap( + crate_path: &syn::Path, + FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, + stream: &syn::Lifetime, +) -> ( + Vec, + Vec, + Vec, + Vec, + TokenStream, +) { + let mut any_ptx_jit = false; + + let mut async_params = Vec::with_capacity(func_inputs.len()); + let mut launch_param_types = Vec::with_capacity(func_inputs.len()); + let mut unboxed_param_types = Vec::with_capacity(func_inputs.len()); + let mut launch_param_wrap = 
Vec::with_capacity(func_inputs.len()); + let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); + + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .for_each(|(arg, (cuda_mode, ptx_jit))| match arg { + syn::FnArg::Typed(syn::PatType { + attrs, + pat, + colon_token, + ty, + }) => { + ptx_jit_param_wrap.push(if ptx_jit.0 { + any_ptx_jit = true; + + quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } + } else { + quote! { None } + }); + + #[allow(clippy::if_same_then_else)] + launch_param_wrap.push(if let syn::Type::Reference(_) = &**ty { + quote! { unsafe { #pat.for_device_async() } } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote! { unsafe { #pat.for_device_async() } } + } else { + quote! { #pat } + }); + + let unboxed_param_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; + unboxed_param_types.push(unboxed_param_type.clone()); + + let cuda_param_type = match cuda_mode { + InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#unboxed_param_type> + }, + InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceAccessible< + <#unboxed_param_type as #crate_path::common::RustToCuda>::CudaRepresentation + > + }, + }; + + let (async_param, launch_param_type) = if let syn::Type::Reference(syn::TypeReference { + mutability, + lifetime, + .. + }) = &**ty + { + let lifetime_or_default = lifetime.clone().unwrap_or(syn::parse_quote!('_)); + let comma: Option = + lifetime.as_ref().map(|_| syn::parse_quote!(,)); + + let (async_param_type, launch_param_type) = if mutability.is_some() { + if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { + abort!( + mutability.span(), + "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." + ); + } + + ( + quote::quote_spanned! 
{ ty.span()=> + #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime_or_default, #cuda_param_type> + }, + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_param_type> + }, + ) + } else { + ( + quote::quote_spanned! { ty.span()=> + #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime_or_default, #cuda_param_type> + }, + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_param_type> + }, + ) + }; + + (quote! { + #(#attrs)* #mutability #pat #colon_token #async_param_type + }, launch_param_type) + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + let async_param_type = quote::quote_spanned! { ty.span()=> + #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_param_type> + }; + let launch_param_type = quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<#cuda_param_type> + }; + + ( + quote! { + #(#attrs)* #pat #colon_token #async_param_type + }, + launch_param_type + ) + } else { + ( + quote! { #(#attrs)* #pat #colon_token #cuda_param_type }, + quote! 
{ #cuda_param_type }, + ) + }; + + async_params.push(async_param); + launch_param_types.push(launch_param_type); + }, + syn::FnArg::Receiver(_) => unreachable!(), + }); + + let ptx_jit_param_wrap = if any_ptx_jit { + quote!(Some(&[#(#ptx_jit_param_wrap),*])) + } else { + quote!(None) + }; + + ( + async_params, + launch_param_types, + unboxed_param_types, + launch_param_wrap, + ptx_jit_param_wrap, + ) +} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs deleted file mode 100644 index cd539a16c..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ /dev/null @@ -1,83 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::super::{FunctionInputs, InputCudaType}; - -pub(super) fn generate_async_func_types( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - stream: &syn::Lifetime, -) -> Vec { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let syn_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote! { - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote! { - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let lifetime = lifetime.clone().unwrap_or(syn::parse_quote!('_)); - - let wrapped_type = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - quote!( - #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime, #cuda_type> - ) - } else { - quote!( - #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime, #cuda_type> - ) - }; - - quote! { - #(#attrs)* #mutability #pat #colon_token #wrapped_type - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let wrapped_type = quote! { - #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_type> - }; - - quote! { - #(#attrs)* #pat #colon_token #wrapped_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #cuda_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs deleted file mode 100644 index 74402c939..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ /dev/null @@ -1,71 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::super::{FunctionInputs, InputCudaType}; - -pub(in super::super) fn generate_launch_types( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .for_each(|(arg, (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. 
}) => { - let syn_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - cpu_func_unboxed_types.push(syn_type.clone()); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - cpu_func_types_launch.push( - if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - let comma: Option = - lifetime.as_ref().map(|_| syn::parse_quote!(,)); - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#cuda_type> - } - } else { - quote! 
{ #cuda_type } - }, - ); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - (cpu_func_types_launch, cpu_func_unboxed_types) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs deleted file mode 100644 index c8ef3dbc0..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ /dev/null @@ -1,87 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -mod async_func_types; -mod launch_types; -mod type_wrap; - -use async_func_types::generate_async_func_types; -use launch_types::generate_launch_types; -use type_wrap::generate_func_input_and_ptx_jit_wraps; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_async( - crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - DeclGenerics { - generic_kernel_params, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - - let kernel_func_async_inputs = generate_async_func_types(crate_path, func_inputs, &stream); - let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); - let (cpu_func_types_launch, cpu_func_unboxed_types) = - generate_launch_types(crate_path, func_inputs); - - quote! 
{ - #[cfg(not(target_os = "cuda"))] - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - pub fn #func_ident_async <#stream, #generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, - #(#kernel_func_async_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? - } else { - #launcher.kernel.compile_with_ptx_jit_args(None)? - }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) - | #crate_path::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #cpu_func_types_launch),*| { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#cpu_func_unboxed_types>();)* - } - - let #crate_path::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = #launcher.config.clone(); - - unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::core::ffi::c_void - ),* - ] - ) } - })(#(#func_input_wrap),*) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs deleted file mode 100644 index 54ba2945b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs +++ /dev/null @@ -1,53 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::wrapper::InputCudaType; - -use super::super::super::super::FunctionInputs; - 
-pub(super) fn generate_func_input_and_ptx_jit_wraps( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, TokenStream) { - let mut any_ptx_jit = false; - - let (func_input_wrap, func_cpu_ptx_jit_wrap): (Vec, Vec) = - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - any_ptx_jit = true; - - quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } - } else { - quote! { None } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip(); - - if any_ptx_jit { - ( - func_input_wrap, - quote!(Some(&[#(#func_cpu_ptx_jit_wrap),*])), - ) - } else { - (func_input_wrap, quote!(None)) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 64f6f4f3f..b79dfb1fd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -35,8 +35,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS, and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! 
for impl)] expects LINKER identifier: {:?}", err ) }, From 27635ee42c4f1111c622767b1e144dd3b018fd2b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 17:59:45 +0000 Subject: [PATCH 053/120] Early sketch of extracting type wrapping from macro into types and traits --- .../kernel/wrapper/generate/cuda_wrapper.rs | 2 + src/common.rs | 489 ++++++++++++++++++ src/host.rs | 77 ++- src/lib.rs | 1 + 4 files changed, 527 insertions(+), 42 deletions(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 058299b41..8aa57ab87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -72,6 +72,8 @@ pub(in super::super) fn quote_cuda_wrapper( syn::TypeReference { and_token, .. } ) = &**ty { // DeviceCopy mode only supports immutable references + // TODO: ptx_jit_load should be here, not there + // also ptx_jit_load should not be enabled for interior mutability quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } } else { quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } diff --git a/src/common.rs b/src/common.rs index cf44848a4..2e7102a73 100644 --- a/src/common.rs +++ b/src/common.rs @@ -239,6 +239,43 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { } } +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceOwnedRef { + pub(super) pointer: *mut T, + pub(super) marker: PhantomData, +} + +// TODO: when should the drop run??? +#[cfg(feature = "host")] +impl Drop for DeviceOwnedRef { + fn drop(&mut self) { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. 
this function completes the roundtrip + let device_box = unsafe { rustacuda::memory::DeviceBox::from_raw(self.pointer) }; + + core::mem::drop(crate::host::CudaDropWrapper::from(device_box)); + } +} + +unsafe impl DeviceCopy for DeviceOwnedRef {} + +#[cfg(any(not(feature = "host"), doc))] +#[doc(cfg(not(feature = "host")))] +impl AsRef for DeviceOwnedRef { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(any(not(feature = "host"), doc))] +#[doc(cfg(not(feature = "host")))] +impl AsMut for DeviceOwnedRef { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} + pub(crate) mod crate_private { pub mod alloc { pub trait Sealed {} @@ -282,3 +319,455 @@ impl CombinedCudaAlloc { (self.0, self.1) } } + +mod sealed { + pub trait Sealed {} +} + +// TODO: doc cfg +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy; + type DeviceType; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ); +} + +#[repr(transparent)] +pub struct PerThreadShallowCopy< + T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout, +>(T); + +#[cfg(not(feature = "host"))] +impl + PerThreadShallowCopy +{ + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::Deref + for PerThreadShallowCopy +{ + type Target = T; + + fn 
deref(&self) -> &Self::Target { + &self.0 + } +} + +impl CudaKernelParameter + for PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type DeviceType = PerThreadShallowCopy; + type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( + param, + )) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + param + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = PerThreadShallowCopy(param.into_inner()); + + inner(param) + } +} +impl sealed::Sealed + for PerThreadShallowCopy +{ +} + +impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> + CudaKernelParameter for &'a PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + type DeviceType = &'a PerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + 
crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + inner(const_ref.as_async()) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = param.as_ref().into_ref(); + // Safety: PerThreadShallowCopy is a transparent newtype wrapper around T + let param = unsafe { &*(param as *const T).cast::>() }; + + inner(param) + } +} +impl<'a, T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> sealed::Sealed + for &'a PerThreadShallowCopy +{ +} + +#[repr(transparent)] +pub struct ShallowInteriorMutable(T); + +#[cfg(not(feature = "host"))] +impl core::ops::Deref for ShallowInteriorMutable { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter + for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + type DeviceType = &'a ShallowInteriorMutable; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + let result = inner(const_ref.as_async()); + + host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( + param, + ))?; + + result + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = param.as_ref().into_ref(); + // Safety: ShallowInteriorMutable is a transparent newtype wrapper around T + let param = unsafe { &*(param as *const T).cast::>() }; + + inner(param) + } +} +impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} + +pub trait InteriorMutableSafeDeviceCopy: + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout + sealed::Sealed +{ +} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! { $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! 
{ + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +// TODO: update const type layout +// impl +// InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell {} +// impl sealed::Sealed for +// core::cell::SyncUnsafeCell {} + +#[repr(transparent)] +pub struct SharedHeapPerThreadShallowCopy(core::mem::ManuallyDrop); + +#[cfg(not(feature = "host"))] +impl SharedHeapPerThreadShallowCopy { + #[must_use] + fn new(value: T) -> Self { + Self(core::mem::ManuallyDrop::new(value)) + } +} + +#[cfg(not(feature = "host"))] +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > SharedHeapPerThreadShallowCopy +{ + #[must_use] + pub fn into_inner(self) -> T { + core::mem::ManuallyDrop::into_inner(self.0) + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::Deref for SharedHeapPerThreadShallowCopy { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::DerefMut for SharedHeapPerThreadShallowCopy { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< + 'stream, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = SharedHeapPerThreadShallowCopy; + // TODO: where does the drop happen? 
+ type FfiType<'stream, 'b> = + DeviceOwnedRef::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + + inner(param) + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for SharedHeapPerThreadShallowCopy +{ +} + +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = &'a SharedHeapPerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 
'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + // param must never be dropped as we do NOT own any of the + // heap memory it might reference + let param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + + inner(¶m) + } +} +impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} + +impl<'a, T: 'static + RustToCuda> CudaKernelParameter + for &'a mut SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceMutRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = &'a mut SharedHeapPerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + mut param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + mut param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + // param must never be dropped as we do NOT own any of the + // heap memory it might reference + let mut param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); + + inner(&mut param) + } +} +impl<'a, T: RustToCuda> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy {} diff --git a/src/host.rs b/src/host.rs 
index 8df7d2fbe..8fa437cf3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -22,7 +22,8 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, + NoCudaAlloc, RustToCuda, }, ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, @@ -779,56 +780,46 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, +pub struct HostAndDeviceOwned { + device_box: HostDeviceBox, + host_val: T, } -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { +impl HostAndDeviceOwned { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, + pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( + value: T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result + inner(HostAndDeviceOwned { + device_box, + host_val: value, + }) } #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, + pub fn for_device(self) -> DeviceOwnedRef { + let mut device_box = ManuallyDrop::new(self.device_box); + + DeviceOwnedRef { + pointer: device_box.0.as_raw_mut(), + marker: PhantomData::, } } #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val + pub fn for_host(&self) -> &T { + &self.host_val } #[must_use] - pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceOwnedAsync<'stream, 'b, T> - where - 'a: 'b, - { + pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, T> { HostAndDeviceOwnedAsync { device_box: self.device_box, host_val: self.host_val, @@ -970,28 +961,30 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, +pub struct HostAndDeviceOwnedAsync<'stream, T: SafeDeviceCopy + DeviceCopy> { + device_box: HostDeviceBox, + host_val: T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { +impl<'stream, T: SafeDeviceCopy + DeviceCopy> 
HostAndDeviceOwnedAsync<'stream, T> { #[must_use] /// # Safety /// - /// The returned [`DeviceConstRef`] must only be used on the + /// The returned [`DeviceOwnedRef`] must only be used on the /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, + pub unsafe fn for_device_async(self) -> DeviceOwnedRef { + let mut device_box = ManuallyDrop::new(self.device_box); + + DeviceOwnedRef { + pointer: device_box.0.as_raw_mut(), + marker: PhantomData, } } #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val + pub fn for_host(&self) -> &T { + &self.host_val } } diff --git a/src/lib.rs b/src/lib.rs index 15e704e79..f4bc7bbe0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ #![feature(panic_info_message)] #![feature(let_chains)] #![feature(inline_const)] +#![feature(sync_unsafe_cell)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From adde7a0359607f2aef5491ca79a2a1419752d580 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 22 Dec 2023 13:11:00 +0000 Subject: [PATCH 054/120] Early work towards using trait for kernel type wrap, ptx jit workaround missing --- examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 24 +- rust-cuda-derive/src/kernel/specialise/ty.rs | 48 +++- .../generate/cpu_linker_macro/get_ptx.rs | 58 +--- .../generate/cpu_wrapper/kernel_func.rs | 140 +++------- .../generate/cpu_wrapper/kernel_func_async.rs | 133 ++------- .../wrapper/generate/cuda_generic_function.rs | 30 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 194 +++---------- .../src/kernel/wrapper/inputs/attribute.rs | 30 +- .../src/kernel/wrapper/inputs/mod.rs | 107 +------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 6 +- src/common.rs | 262 +++++++++--------- src/host.rs | 46 ++- src/lib.rs | 1 + 14 files changed, 366 
insertions(+), 715 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index dc38b3fa9..17cf42fd8 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -23,7 +23,7 @@ pub enum Action { #[rust_cuda::common::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { +pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 796e6ee4f..10be57d65 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -11,6 +11,7 @@ #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] #![feature(decl_macro)] +#![recursion_limit = "1024"] extern crate alloc; @@ -55,20 +56,22 @@ pub struct Triple(i32, i32, i32); )] pub fn kernel< 'a, - T: rc::common::RustToCuda< + T: 'static + + rc::common::RustToCuda< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::common::EmptyCudaAlloc, - > + rc::safety::StackOnly + > + + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, - #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, - #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, - #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, - #[kernel(pass = LendRustToCuda)] _: Wrapper, - #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - #[kernel(pass = SafeDeviceCopy)] q: Triple, - // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, + _x: &rc::common::PerThreadShallowCopy, + #[kernel(jit)] _y: &mut rc::common::SharedHeapPerThreadShallowCopy>, + _z: &rc::common::SharedHeapPerThreadShallowCopy>, + #[kernel(jit)] _v @ _w: &'a rc::common::ShallowInteriorMutable, + _: 
rc::common::SharedHeapPerThreadShallowCopy>, + Tuple(s, mut __t): rc::common::PerThreadShallowCopy, + q: rc::common::PerThreadShallowCopy, + // shared3: ThreadBlockShared, ) { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -80,6 +83,7 @@ pub fn kernel< unsafe { (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } + // unsafe { core::arch::asm!("hi") } // unsafe { // *shared3.as_mut_ptr() = 12; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 9805abc3c..1671f43f0 100644 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -54,15 +54,23 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { ); } + // replace all lifetimes with 'static + ty = syn::fold::Fold::fold_type( + &mut FoldLifetimeAllStatic { + r#static: syn::parse_quote!('static), + }, + ty, + ); + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { match (generic, arg) { ( syn::GenericParam::Lifetime(syn::LifetimeDef { - lifetime: generic, .. + lifetime: _generic, .. }), - syn::GenericArgument::Lifetime(arg), + syn::GenericArgument::Lifetime(_arg), ) => { - ty = syn::fold::Fold::fold_type(&mut FoldLifetimeGeneric { generic, arg }, ty); + // all lifetimes are already replaced with 'static above }, ( syn::GenericParam::Const(syn::ConstParam { ident: generic, .. 
}), @@ -115,18 +123,34 @@ impl syn::parse::Parse for SpecialiseTypeConfig { } } -struct FoldLifetimeGeneric { - generic: syn::Lifetime, - arg: syn::Lifetime, +struct FoldLifetimeAllStatic { + r#static: syn::Lifetime, } -impl syn::fold::Fold for FoldLifetimeGeneric { +impl syn::fold::Fold for FoldLifetimeAllStatic { + fn fold_type_reference(&mut self, r#ref: syn::TypeReference) -> syn::TypeReference { + let syn::TypeReference { + and_token, + lifetime: _, + mutability, + elem, + } = r#ref; + + syn::fold::fold_type_reference( + self, + syn::TypeReference { + and_token, + lifetime: Some(self.r#static.clone()), + mutability, + elem, + }, + ) + } + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { - if lt == self.generic { - self.arg.clone() - } else { - lt - } + let mut r#static = self.r#static.clone(); + r#static.set_span(lt.span()); + r#static } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 7e4b88f87..e838d400c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,7 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -121,60 +121,32 @@ fn generate_lifetime_erased_types( generic_close_token, .. }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + FunctionInputs { func_inputs, .. 
}: &FunctionInputs, macro_type_ids: &[syn::Ident], -) -> Vec { - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); - +) -> Vec { func_inputs .iter() - .zip(func_input_cuda_types.iter()) .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(i, arg)| match arg { syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> + + let mut specialised_ty = quote::quote_spanned! { ty.span()=> <() as #args #generic_start_token #($#macro_type_ids),* #generic_close_token>::#type_ident }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); + quote::quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + } }, syn::FnArg::Receiver(_) => unreachable!(), - }); - - cpu_func_lifetime_erased_types + }).collect() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 5cc6f8077..7eb7db1a4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,6 +1,7 @@ use proc_macro2::TokenStream; +use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, @@ -16,13 +17,32 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let kernel_func_input_tys = func_inputs + let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => syn::Type::clone(ty), + syn::FnArg::Typed(syn::PatType { + attrs, + ty, + pat, + colon_token, + }) => { + let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType + }; + + ( + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }), + ty, + ) + }, syn::FnArg::Receiver(_) => unreachable!(), }) - .collect::>(); + .unzip(); let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); @@ -60,26 +80,10 @@ pub(super) fn quote_kernel_func_inputs( #[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, - #func_inputs + #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; - // impls check adapted from Nikolai Vazquez's `impls` crate: - // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - - >::SYNC - } - #raw_func_input_wrap } } @@ -88,89 +92,27 @@ pub(super) fn quote_kernel_func_inputs( #[allow(clippy::too_many_lines)] fn generate_raw_func_input_wrap( crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + FunctionInputs { func_inputs, .. }: &FunctionInputs, FuncIdent { func_ident_async, .. }: &FuncIdent, func_params: &[syn::Ident], launcher: &syn::Ident, ) -> TokenStream { - func_inputs - .iter() - .zip(func_params) - .zip(func_input_cuda_types.iter()) - .rev() - .fold( - quote! { - #func_ident_async(#launcher, #(#func_params),*)?; - #launcher.stream.synchronize() - }, - |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { - InputCudaType::SafeDeviceCopy => { - if let syn::Type::Reference(..) 
= &**ty { - let pat_box = quote::format_ident!("__{}_box", param); - - // DeviceCopy mode only supports immutable references - quote! { - let mut #pat_box = #crate_path::host::HostDeviceBox::from( - #crate_path::rustacuda::memory::DeviceBox::new( - #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - )? - ); - #[allow(clippy::redundant_closure_call)] - // Safety: `#pat_box` contains exactly the device copy of `#pat` - let __result = (|#pat| { #inner })(unsafe { - #crate_path::host::HostAndDeviceConstRef::new( - &#pat_box, #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ).as_async() - }); - - #[allow(invalid_reference_casting)] - if !__check_is_sync(#pat) { - // Safety: - // * Since `#ty` is `!Sync`, it contains interior mutability - // * Therefore, part of the 'immutable' device copy may have - // been mutated - // * If all mutation was confined to interior mutability, - // then passing these changes on is safe (and expected) - // * If any mutations occured outside interior mutability, - // then UB occurred, in the kernel (we're not the cause) - #pat_box.copy_to(unsafe { &mut *(#pat as *const _ as *mut _) })?; - } - - ::core::mem::drop(#pat_box); - __result - } - } else { - quote! { { - let #pat = #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); - #inner - } } - } - }, - InputCudaType::LendRustToCuda => { - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote! { #crate_path::host::LendToCuda::lend_to_cuda_mut( - #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } else { - quote! { #crate_path::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } - } else { - quote! { #crate_path::host::LendToCuda::move_to_cuda( - #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } - }, - }, - syn::FnArg::Receiver(_) => unreachable!(), + func_inputs.iter().rev().fold( + quote! 
{ + #func_ident_async(#launcher, #(#func_params),*)?; + #launcher.stream.synchronize() + }, + |inner, arg| match arg { + syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { + quote::quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::with_new_async( + #pat, #launcher.stream, |#pat| { #inner } + ) + } }, - ) + syn::FnArg::Receiver(_) => unreachable!(), + }, + ) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs index 8a0013900..39ce95e9d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( @@ -23,13 +23,8 @@ pub(super) fn quote_kernel_func_async( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - let ( - async_params, - launch_param_types, - unboxed_param_types, - launch_param_wrap, - ptx_jit_param_wrap, - ) = generate_type_wrap(crate_path, func_inputs, &stream); + let (async_params, launch_param_types, launch_param_wrap, _ptx_jit_param_wrap) = + generate_type_wrap(crate_path, func_inputs, &stream); quote! { #[cfg(not(target_os = "cuda"))] @@ -43,7 +38,7 @@ pub(super) fn quote_kernel_func_async( #(#async_params),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(#ptx_jit_param_wrap)? + #launcher.kernel.compile_with_ptx_jit_args(None)? // TODO: #ptx_jit_param_wrap)? 
} else { #launcher.kernel.compile_with_ptx_jit_args(None)? }; @@ -54,17 +49,6 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::redundant_closure_call)] (|#(#func_params: #launch_param_types),*| { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#unboxed_param_types>();)* - } - let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, } = #launcher.config.clone(); @@ -81,7 +65,6 @@ pub(super) fn quote_kernel_func_async( } } -#[allow(clippy::too_many_lines)] // FIXME fn generate_type_wrap( crate_path: &syn::Path, FunctionInputs { @@ -90,8 +73,7 @@ fn generate_type_wrap( }: &FunctionInputs, stream: &syn::Lifetime, ) -> ( - Vec, - Vec, + Vec, Vec, Vec, TokenStream, @@ -100,14 +82,13 @@ fn generate_type_wrap( let mut async_params = Vec::with_capacity(func_inputs.len()); let mut launch_param_types = Vec::with_capacity(func_inputs.len()); - let mut unboxed_param_types = Vec::with_capacity(func_inputs.len()); let mut launch_param_wrap = Vec::with_capacity(func_inputs.len()); let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); func_inputs .iter() .zip(func_input_cuda_types.iter()) - .for_each(|(arg, (cuda_mode, ptx_jit))| match arg { + .for_each(|(arg, ptx_jit)| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -122,95 +103,30 @@ fn generate_type_wrap( quote! { None } }); - #[allow(clippy::if_same_then_else)] - launch_param_wrap.push(if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } + let async_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::AsyncHostType<#stream, '_> + }; + + let async_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(async_ty), + pat: pat.clone(), + colon_token: *colon_token, }); - let unboxed_param_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - unboxed_param_types.push(unboxed_param_type.clone()); - - let cuda_param_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#unboxed_param_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#unboxed_param_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, + async_params.push(async_param); + + let launch_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::FfiType<#stream, '_> }; - let (async_param, launch_param_type) = if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - let lifetime_or_default = lifetime.clone().unwrap_or(syn::parse_quote!('_)); - let comma: Option = - lifetime.as_ref().map(|_| syn::parse_quote!(,)); - - let (async_param_type, launch_param_type) = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - ( - quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime_or_default, #cuda_param_type> - }, - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_param_type> - }, - ) - } else { - ( - quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime_or_default, #cuda_param_type> - }, - quote::quote_spanned! 
{ ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_param_type> - }, - ) - }; - - (quote! { - #(#attrs)* #mutability #pat #colon_token #async_param_type - }, launch_param_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let async_param_type = quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_param_type> - }; - let launch_param_type = quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#cuda_param_type> - }; - - ( - quote! { - #(#attrs)* #pat #colon_token #async_param_type - }, - launch_param_type - ) - } else { - ( - quote! { #(#attrs)* #pat #colon_token #cuda_param_type }, - quote! { #cuda_param_type }, - ) + launch_param_types.push(launch_ty); + + let launch_wrap = quote::quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::async_to_ffi(#pat) }; - async_params.push(async_param); - launch_param_types.push(launch_param_type); + launch_param_wrap.push(launch_wrap); }, syn::FnArg::Receiver(_) => unreachable!(), }); @@ -224,7 +140,6 @@ fn generate_type_wrap( ( async_params, launch_param_types, - unboxed_param_types, launch_param_wrap, ptx_jit_param_wrap, ) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index aa23b77c6..a6d8ac550 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,8 +1,10 @@ use proc_macro2::TokenStream; +use syn::spanned::Spanned; use super::super::{DeclGenerics, FuncIdent}; pub(in super::super) fn quote_cuda_generic_function( + crate_path: &syn::Path, DeclGenerics { generic_start_token, generic_kernel_params: generic_params, @@ -14,10 +16,36 @@ pub(in super::super) fn quote_cuda_generic_function( func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { + let 
kernel_func_inputs = func_inputs + .iter() + .map(|arg| match arg { + syn::FnArg::Typed(syn::PatType { + attrs, + ty, + pat, + colon_token, + }) => { + let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> + }; + + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty), + pat: pat.clone(), + colon_token: *colon_token, + }) + }, + syn::FnArg::Receiver(_) => unreachable!(), + }) + .collect::>(); + quote! { #[cfg(target_os = "cuda")] #(#func_attrs)* - fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) + fn #func_ident #generic_start_token #generic_params #generic_close_token ( + #(#kernel_func_inputs),* + ) #func_block } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 8aa57ab87..24365ee29 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,10 +1,9 @@ use proc_macro2::TokenStream; -use quote::quote_spanned; use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, + FuncIdent, FunctionInputs, ImplGenerics, }; #[allow(clippy::too_many_lines)] @@ -26,10 +25,8 @@ pub(in super::super) fn quote_cuda_wrapper( func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = - specialise_ptx_func_inputs(crate_path, inputs, func, impl_generics); - let ptx_func_unboxed_types = - specialise_ptx_unboxed_types(crate_path, inputs, func, impl_generics); + let (ffi_inputs, ffi_types) = + specialise_ffi_input_types(crate_path, inputs, func, impl_generics); let func_layout_params = func_params .iter() @@ -41,69 +38,32 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); - let 
ptx_func_input_unwrap = func_inputs + let ffi_param_ptx_jit_wrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() .fold(quote! { #func_ident(#(#func_params),*) - }, |inner, (i, (arg, (cuda_mode, ptx_jit)))| match arg { + }, |inner, (_i, (arg, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { // Emit PTX JIT load markers - let ptx_jit_load = if ptx_jit.0 { - quote! { - #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - } - } else { quote! {} }; - - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #generics in #func_ident) + // let ptx_jit_load = if ptx_jit.0 { + // quote! { + // #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) + // } + // } else { quote! {} }; + + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) }; - match cuda_mode { - InputCudaType::SafeDeviceCopy => if let syn::Type::Reference( - syn::TypeReference { and_token, .. } - ) = &**ty { - // DeviceCopy mode only supports immutable references - // TODO: ptx_jit_load should be here, not there - // also ptx_jit_load should not be enabled for interior mutability - quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } - } else { - quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } - }, - InputCudaType::LendRustToCuda => if let syn::Type::Reference( - syn::TypeReference { and_token, mutability, ..} - ) = &**ty { - if mutability.is_some() { - quote! { - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability #crate_path::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } else { - quote! 
{ - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token #crate_path::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } - } else { - quote! { - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_moved_from_rust( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device( + #pat, |#pat| { #inner } + ) } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -114,15 +74,15 @@ pub(in super::super) fn quote_cuda_wrapper( #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } #( #[no_mangle] static #func_layout_params: [ - u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); + u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ffi_types>() + ] = #crate_path::const_type_layout::serialise_type_graph::<#ffi_types>(); unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* @@ -137,128 +97,50 @@ pub(in super::super) fn quote_cuda_wrapper( extern "C" { #( #[allow(dead_code)] - static #func_params: #ptx_func_types; + static #func_params: #ffi_types; )* } } - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#ptx_func_unboxed_types>();)* - } - - #ptx_func_input_unwrap + #ffi_param_ptx_jit_wrap } } } -fn specialise_ptx_func_inputs( +fn specialise_ffi_input_types( crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, 
+ FunctionInputs { func_inputs, .. }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, ImplGenerics { impl_generics, .. }: &ImplGenerics, -) -> (Vec, Vec) { +) -> (Vec, Vec) { func_inputs .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, _ptx_jit))| match arg { + .map(|arg| match arg { syn::FnArg::Typed( - fn_arg @ syn::PatType { + syn::PatType { attrs, pat, colon_token, ty, }, ) => { - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) }; - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - let ty = if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let lifetime = quote_spanned! { lifetime.span()=> - 'static - }; - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = quote_spanned! { ty.span()=> - 'static - }; - - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - cuda_type + let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> }; - let fn_arg = quote::quote_spanned! { fn_arg.span()=> - #(#attrs)* #pat #colon_token #ty - }; + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); - (fn_arg, ty) + (ffi_param, ffi_ty) }, syn::FnArg::Receiver(_) => unreachable!(), }) .unzip() } - -fn specialise_ptx_unboxed_types( - crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, - FuncIdent { func_ident, .. }: &FuncIdent, - ImplGenerics { impl_generics, .. }: &ImplGenerics, -) -> Vec { - func_inputs - .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index ceeee1e3e..4ca2ff7bf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -1,9 +1,6 @@ use syn::spanned::Spanned; -use super::InputCudaType; - pub(super) enum KernelInputAttribute { - PassType(proc_macro2::Span, InputCudaType), PtxJit(proc_macro2::Span, bool), } @@ -12,31 +9,6 @@ impl syn::parse::Parse for KernelInputAttribute { let ident: syn::Ident = input.parse()?; match &*ident.to_string() { - "pass" => { - let eq: syn::token::Eq = input.parse()?; - let mode: syn::Ident = input.parse()?; - - let cuda_type = match &*mode.to_string() { - "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, - "LendRustToCuda" => InputCudaType::LendRustToCuda, - _ => abort!( - 
mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", - mode - ), - }; - - Ok(KernelInputAttribute::PassType( - ident - .span() - .join(eq.span()) - .unwrap() - .join(mode.span()) - .unwrap(), - cuda_type, - )) - }, "jit" => { let eq: Option = input.parse()?; @@ -61,7 +33,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{:?}`: Expected `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index 9222de237..154503702 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -1,19 +1,19 @@ use syn::spanned::Spanned; -use super::{InputCudaType, InputPtxJit}; +use super::InputPtxJit; mod attribute; use attribute::{KernelInputAttribute, KernelInputAttributes}; pub(super) struct FunctionInputs { pub(super) func_inputs: syn::punctuated::Punctuated, - pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, + pub(super) func_input_cuda_types: Vec, } pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { let (func_inputs, func_input_cuda_types): ( syn::punctuated::Punctuated, - Vec<(InputCudaType, InputPtxJit)>, + Vec, ) = func .sig .inputs @@ -22,15 +22,12 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { receiver @ syn::FnArg::Receiver(_) => { abort!(receiver.span(), "Kernel function must not have a receiver.") }, - syn::FnArg::Typed( - input @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let mut cuda_type: Option = None; + syn::FnArg::Typed(syn::PatType { + attrs, + pat, + colon_token, + ty, + }) => { let mut ptx_jit: Option = None; let attrs = attrs @@ -45,14 +42,6 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { for attr in attrs 
{ match attr { - KernelInputAttribute::PassType(_span, pass_type) - if cuda_type.is_none() => - { - cuda_type = Some(pass_type); - }, - KernelInputAttribute::PassType(span, _pass_type) => { - abort!(span, "Duplicate CUDA transfer mode declaration."); - }, KernelInputAttribute::PtxJit(span, jit) if ptx_jit.is_none() => { @@ -78,24 +67,14 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { .cloned() .collect(); - let cuda_type = cuda_type.unwrap_or_else(|| { - abort!( - input.span(), - "Kernel function input must specify its CUDA transfer mode using \ - #[kernel(pass = ...)]." - ); - }); - - let ty = ensure_reference_type_lifetime(ty, &cuda_type); - ( syn::FnArg::Typed(syn::PatType { attrs, pat: pat.clone(), colon_token: *colon_token, - ty, + ty: ty.clone(), }), - (cuda_type, ptx_jit.unwrap_or(InputPtxJit(false))), + ptx_jit.unwrap_or(InputPtxJit(false)), ) }, }) @@ -106,67 +85,3 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { func_input_cuda_types, } } - -#[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime(ty: &syn::Type, cuda_type: &InputCudaType) -> Box { - match ty { - syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - elem, - }) => { - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { - (|| { - if let syn::Type::Path(syn::TypePath { - path: syn::Path { segments, .. }, - qself: None, - }) = &**elem - { - if let Some(syn::PathSegment { - ident, - arguments: - syn::PathArguments::AngleBracketed( - syn::AngleBracketedGenericArguments { args, .. }, - ), - }) = segments.last() - { - if ident == "ShallowCopy" && segments.len() == 1 { - match args.last() { - Some(syn::GenericArgument::Type(elem)) if args.len() == 1 => { - return Box::new(elem.clone()); - }, - _ => { - abort!( - args.span(), - "`ShallowCopy` takes exactly one generic type \ - argument." 
- ); - }, - } - } - } - } - - emit_warning!( - elem.span(), - "RustToCuda kernel parameters should be explicitly wrapped with the \ - `ShallowCopy` marker to communicate their aliasing behaviour." - ); - - elem.clone() - })() - } else { - elem.clone() - }; - - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem, - })) - }, - ty => Box::new(ty.clone()), - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index b79dfb1fd..79bae8dbd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -237,6 +237,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, ); let cuda_generic_function = quote_cuda_generic_function( + &crate_path, &decl_generics, &pat_func_inputs, &func_ident, @@ -257,11 +258,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .into() } -enum InputCudaType { - SafeDeviceCopy, - LendRustToCuda, -} - struct InputPtxJit(bool); #[allow(clippy::struct_field_names)] diff --git a/src/common.rs b/src/common.rs index 2e7102a73..c4a880262 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,11 +1,12 @@ #[cfg(any(not(feature = "host"), doc))] use core::convert::{AsMut, AsRef}; -use core::marker::PhantomData; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; #[cfg(feature = "host")] use alloc::fmt; -#[cfg(not(feature = "host"))] -use core::ops::{Deref, DerefMut}; #[cfg(feature = "host")] use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; @@ -241,28 +242,18 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { #[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceOwnedRef { +pub struct DeviceOwnedRef<'r, T: DeviceCopy> { + #[cfg_attr(feature = "host", allow(dead_code))] pub(super) pointer: *mut T, + pub(super) reference: PhantomData<&'r mut ()>, pub(super) marker: PhantomData, } -// TODO: 
when should the drop run??? -#[cfg(feature = "host")] -impl Drop for DeviceOwnedRef { - fn drop(&mut self) { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { rustacuda::memory::DeviceBox::from_raw(self.pointer) }; - - core::mem::drop(crate::host::CudaDropWrapper::from(device_box)); - } -} - -unsafe impl DeviceCopy for DeviceOwnedRef {} +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] -impl AsRef for DeviceOwnedRef { +impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } @@ -270,7 +261,7 @@ impl AsRef for DeviceOwnedRef { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] -impl AsMut for DeviceOwnedRef { +impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } } @@ -321,6 +312,7 @@ impl CombinedCudaAlloc { } mod sealed { + #[doc(hidden)] pub trait Sealed {} } @@ -330,8 +322,8 @@ pub trait CudaKernelParameter: sealed::Sealed { type SyncHostType; #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy; - type DeviceType; + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + type DeviceType<'b>; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME @@ -347,44 +339,55 @@ pub trait CudaKernelParameter: sealed::Sealed { ) -> Self::FfiType<'stream, 'b>; #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; } #[repr(transparent)] pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout, ->(T); - -#[cfg(not(feature = "host"))] -impl 
- PerThreadShallowCopy -{ - #[must_use] - pub fn into_inner(self) -> T { - self.0 - } + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, +> { + never: !, + _marker: PhantomData, } -#[cfg(not(feature = "host"))] -impl core::ops::Deref - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > Deref for PerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > DerefMut for PerThreadShallowCopy +{ + fn deref_mut(&mut self) -> &mut Self::Target { + self.never } } -impl CudaKernelParameter - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - type DeviceType = PerThreadShallowCopy; + type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] type SyncHostType = T; @@ -408,22 +411,30 @@ impl Cuda } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - let param = PerThreadShallowCopy(param.into_inner()); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.into_inner(); inner(param) } } -impl sealed::Sealed - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy { } -impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> - 
CudaKernelParameter for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< @@ -431,7 +442,7 @@ impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGra 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - type DeviceType = &'a PerThreadShallowCopy; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; #[cfg(feature = "host")] @@ -466,31 +477,35 @@ impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGra } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { let param = param.as_ref().into_ref(); - // Safety: PerThreadShallowCopy is a transparent newtype wrapper around T - let param = unsafe { &*(param as *const T).cast::>() }; inner(param) } } -impl<'a, T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> sealed::Sealed - for &'a PerThreadShallowCopy +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy { } #[repr(transparent)] -pub struct ShallowInteriorMutable(T); +pub struct ShallowInteriorMutable { + never: !, + _marker: PhantomData, +} -#[cfg(not(feature = "host"))] -impl core::ops::Deref for ShallowInteriorMutable { +impl Deref for ShallowInteriorMutable { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never } } @@ -503,7 +518,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, 
crate::utils::device_copy::SafeDeviceCopyWrapper, >; - type DeviceType = &'a ShallowInteriorMutable; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; #[cfg(feature = "host")] @@ -546,13 +561,11 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { let param = param.as_ref().into_ref(); - // Safety: ShallowInteriorMutable is a transparent newtype wrapper around T - let param = unsafe { &*(param as *const T).cast::>() }; inner(param) } @@ -560,7 +573,10 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout + sealed::Sealed + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout + + sealed::Sealed { } @@ -587,62 +603,41 @@ impl_atomic_interior_mutable! 
{ // core::cell::SyncUnsafeCell {} #[repr(transparent)] -pub struct SharedHeapPerThreadShallowCopy(core::mem::ManuallyDrop); - -#[cfg(not(feature = "host"))] -impl SharedHeapPerThreadShallowCopy { - #[must_use] - fn new(value: T) -> Self { - Self(core::mem::ManuallyDrop::new(value)) - } -} - -#[cfg(not(feature = "host"))] -impl< - T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, - > SharedHeapPerThreadShallowCopy -{ - #[must_use] - pub fn into_inner(self) -> T { - core::mem::ManuallyDrop::into_inner(self.0) - } +pub struct SharedHeapPerThreadShallowCopy { + never: !, + _marker: PhantomData, } -#[cfg(not(feature = "host"))] -impl core::ops::Deref for SharedHeapPerThreadShallowCopy { +impl Deref for SharedHeapPerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never } } -#[cfg(not(feature = "host"))] -impl core::ops::DerefMut for SharedHeapPerThreadShallowCopy { +impl DerefMut for SharedHeapPerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 + self.never } } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< 'stream, + 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = SharedHeapPerThreadShallowCopy; - // TODO: where does the drop happen? 
+ type DeviceType<'b> = T; type FfiType<'stream, 'b> = - DeviceOwnedRef::CudaRepresentation>>; + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] type SyncHostType = T; @@ -663,33 +658,35 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - let param = - SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // The type contains no allocations and is safe to copy + let param = unsafe { CudaAsRust::as_rust(param.as_ref()) }; inner(param) } } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, > sealed::Sealed for SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = &'a SharedHeapPerThreadShallowCopy; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] @@ -712,21 +709,23 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThrea } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - // param must never be dropped as we do NOT own any of the - // heap memory it might reference - let param = - 
SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // Safety: param must never be dropped as we do NOT own any of the + // heap memory it might reference + let param = core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); inner(¶m) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a SharedHeapPerThreadShallowCopy +{ +} -impl<'a, T: 'static + RustToCuda> CudaKernelParameter +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter for &'a mut SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -735,7 +734,7 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = &'a mut SharedHeapPerThreadShallowCopy; + type DeviceType<'b> = &'b mut T; type FfiType<'stream, 'b> = DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] @@ -758,16 +757,19 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - // param must never be dropped as we do NOT own any of the - // heap memory it might reference + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // Safety: param must never be dropped as we do NOT own any of the + // heap memory it might reference let mut param = - SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); + core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); inner(&mut param) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy {} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a mut 
SharedHeapPerThreadShallowCopy +{ +} diff --git a/src/host.rs b/src/host.rs index 8fa437cf3..f15bca27e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -780,46 +780,45 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned { - device_box: HostDeviceBox, - host_val: T, +pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, } -impl HostAndDeviceOwned { +impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( - value: T, + mut value: T, inner: F, ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { - device_box, - host_val: value, + device_box: &mut device_box, + host_val: &mut value, }) } #[must_use] - pub fn for_device(self) -> DeviceOwnedRef { - let mut device_box = ManuallyDrop::new(self.device_box); - + pub fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: device_box.0.as_raw_mut(), + pointer: self.device_box.0.as_raw_mut(), marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, } } #[must_use] pub fn for_host(&self) -> &T { - &self.host_val + self.host_val } #[must_use] - pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, T> { + pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { device_box: self.device_box, host_val: self.host_val, @@ -961,30 +960,29 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, T: 
SafeDeviceCopy + DeviceCopy> { - device_box: HostDeviceBox, - host_val: T, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, T> { +impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { #[must_use] /// # Safety /// /// The returned [`DeviceOwnedRef`] must only be used on the /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceOwnedRef { - let mut device_box = ManuallyDrop::new(self.device_box); - + pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: device_box.0.as_raw_mut(), - marker: PhantomData, + pointer: self.device_box.0.as_raw_mut(), + marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, } } #[must_use] pub fn for_host(&self) -> &T { - &self.host_val + self.host_val } } diff --git a/src/lib.rs b/src/lib.rs index f4bc7bbe0..61b807d8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ #![feature(let_chains)] #![feature(inline_const)] #![feature(sync_unsafe_cell)] +#![feature(never_type)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From 446b1f7a8675dc04a616f2aff4379471b0ab9204 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 10:48:00 +0000 Subject: [PATCH 055/120] Lift complete CPU kernel wrapper from proc macro into public functions --- Cargo.toml | 9 +- examples/print/src/main.rs | 18 +- examples/single-source/src/main.rs | 4 +- .../generate/cpu_linker_macro/args_trait.rs | 9 +- .../generate/cpu_linker_macro/get_ptx.rs | 39 +- .../generate/cpu_wrapper/kernel_func.rs | 67 ++-- .../generate/cpu_wrapper/kernel_func_async.rs | 146 ------- .../wrapper/generate/cpu_wrapper/mod.rs | 14 - 
.../wrapper/generate/cuda_generic_function.rs | 19 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 91 ++--- .../src/kernel/wrapper/inputs/attribute.rs | 65 ---- .../src/kernel/wrapper/inputs/mod.rs | 87 ----- rust-cuda-derive/src/kernel/wrapper/mod.rs | 68 ++-- rust-cuda-derive/src/kernel/wrapper/parse.rs | 14 + rust-cuda-ptx-jit/Cargo.toml | 17 - rust-cuda-ptx-jit/src/device.rs | 13 - rust-cuda-ptx-jit/src/host/regex.rs | 46 --- rust-cuda-ptx-jit/src/lib.rs | 23 -- src/common.rs | 355 +++++++++++++++++- src/{host.rs => host/mod.rs} | 271 +++++++++---- .../src/host => src/host/ptx_jit}/mod.rs | 0 .../host => src/host/ptx_jit}/preprocess.rs | 10 +- src/host/ptx_jit/regex.rs | 58 +++ .../src/host => src/host/ptx_jit}/replace.rs | 9 +- src/lib.rs | 1 - 25 files changed, 783 insertions(+), 670 deletions(-) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs delete mode 100644 rust-cuda-ptx-jit/Cargo.toml delete mode 100644 rust-cuda-ptx-jit/src/device.rs delete mode 100644 rust-cuda-ptx-jit/src/host/regex.rs delete mode 100644 rust-cuda-ptx-jit/src/lib.rs rename src/{host.rs => host/mod.rs} (77%) rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/mod.rs (100%) rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/preprocess.rs (93%) create mode 100644 src/host/ptx_jit/regex.rs rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/replace.rs (96%) diff --git a/Cargo.toml b/Cargo.toml index 9e9a568f2..0a1375547 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] members = [ - ".", "rust-cuda-derive", "rust-cuda-ptx-jit", + ".", "rust-cuda-derive", "examples/derive", "examples/print", "examples/single-source", ] default-members = [ - ".", "rust-cuda-derive", "rust-cuda-ptx-jit" + ".", "rust-cuda-derive", ] [package] @@ -19,7 +19,7 @@ rust-version = "1.75" # 
nightly [features] default = [] -host = ["rustacuda", "rust-cuda-ptx-jit/host"] +host = ["rustacuda", "regex"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] @@ -28,9 +28,10 @@ rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } +regex = { version = "1.10", optional = true } + const-type-layout = { version = "0.2.0", features = ["derive"] } final = "0.1.1" rust-cuda-derive = { path = "rust-cuda-derive", optional = true } -rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 17cf42fd8..3d2f776e4 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -71,13 +71,25 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1(&stream, &config, Action::Print)?; + kernel.launch1::>( + &stream, + &config, + Action::Print, + )?; // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - kernel.launch1(&stream, &config, Action::Panic)?; + kernel.launch1::>( + &stream, + &config, + Action::Panic, + )?; // kernel(&mut launcher, Action::Panic)?; println!("Launching alloc error kernel ..."); - kernel.launch1(&stream, &config, Action::AllocError)?; + kernel.launch1::>( + &stream, + &config, + Action::AllocError, + )?; // kernel(&mut launcher, Action::AllocError)?; Ok(()) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 10be57d65..f53963f9d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -65,9 +65,9 @@ pub fn kernel< + rc::safety::NoSafeAliasing, >( _x: 
&rc::common::PerThreadShallowCopy, - #[kernel(jit)] _y: &mut rc::common::SharedHeapPerThreadShallowCopy>, + _y: &mut rc::common::PtxJit>>, _z: &rc::common::SharedHeapPerThreadShallowCopy>, - #[kernel(jit)] _v @ _w: &'a rc::common::ShallowInteriorMutable, + _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, Tuple(s, mut __t): rc::common::PerThreadShallowCopy, q: rc::common::PerThreadShallowCopy, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs index 178ed026d..25cc27955 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs @@ -8,7 +8,7 @@ pub(in super::super) fn quote_args_trait( impl_generics, ty_generics, }: &ImplGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, ) -> TokenStream { let func_input_typedefs = (0..func_inputs.len()) .map(|i| { @@ -23,12 +23,7 @@ pub(in super::super) fn quote_args_trait( let func_input_types = func_inputs .iter() .enumerate() - .map(|(i, arg)| { - let pat_type = match arg { - syn::FnArg::Typed(pat_type) => pat_type, - syn::FnArg::Receiver(_) => unreachable!(), - }; - + .map(|(i, pat_type)| { let type_ident = quote::format_ident!("__T_{}", i); let arg_type = match &*pat_type.ty { syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index e838d400c..439f27f9e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -121,32 +121,29 @@ fn generate_lifetime_erased_types( generic_close_token, .. 
}: &DeclGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, macro_type_ids: &[syn::Ident], ) -> Vec { func_inputs .iter() .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - - let mut specialised_ty = quote::quote_spanned! { ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - // the args trait has to unbox outer lifetimes, so we need to add them back in here - if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { - let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; - - specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; - } + .map(|(i, syn::PatType { ty, .. })| { + let type_ident = quote::format_ident!("__T_{}", i); - quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> - } - }, - syn::FnArg::Receiver(_) => unreachable!(), + let mut specialised_ty = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; + + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } + + quote::quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + } }).collect() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 7eb7db1a4..b854ce160 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -12,20 +12,20 @@ pub(super) fn quote_kernel_func_inputs( generic_close_token, .. }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, - fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - ty, - pat, - colon_token, - }) => { + .map( + |syn::PatType { + attrs, + ty, + pat, + colon_token, + }| { let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType }; @@ -40,14 +40,17 @@ pub(super) fn quote_kernel_func_inputs( ty, ) }, - syn::FnArg::Receiver(_) => unreachable!(), - }) + ) .unzip(); + let cuda_kernel_param_tys = func_inputs + .iter() + .map(|syn::PatType { ty, .. 
}| &**ty) + .collect::>(); + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let raw_func_input_wrap = - generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params, &launcher); + let launch = quote::format_ident!("launch{}", func_inputs.len()); let full_generics = generic_kernel_params .iter() @@ -74,45 +77,19 @@ pub(super) fn quote_kernel_func_inputs( #[cfg(not(target_os = "cuda"))] #(#func_attrs)* - #[allow(clippy::needless_lifetimes)] #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, + #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; - #raw_func_input_wrap + #launcher.#launch::< + #(#cuda_kernel_param_tys),* + >(#(#func_params),*) } } } - -#[allow(clippy::too_many_lines)] -fn generate_raw_func_input_wrap( - crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, - FuncIdent { - func_ident_async, .. - }: &FuncIdent, - func_params: &[syn::Ident], - launcher: &syn::Ident, -) -> TokenStream { - func_inputs.iter().rev().fold( - quote! { - #func_ident_async(#launcher, #(#func_params),*)?; - #launcher.stream.synchronize() - }, - |inner, arg| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - quote::quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::with_new_async( - #pat, #launcher.stream, |#pat| { #inner } - ) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs deleted file mode 100644 index 39ce95e9d..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs +++ /dev/null @@ -1,146 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_async( - crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - DeclGenerics { - generic_kernel_params, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - - let (async_params, launch_param_types, launch_param_wrap, _ptx_jit_param_wrap) = - generate_type_wrap(crate_path, func_inputs, &stream); - - quote! { - #[cfg(not(target_os = "cuda"))] - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - pub fn #func_ident_async <#stream, #generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, - #(#async_params),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(None)? // TODO: #ptx_jit_param_wrap)? 
- } else { - #launcher.kernel.compile_with_ptx_jit_args(None)? - }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) - | #crate_path::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #launch_param_types),*| { - let #crate_path::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = #launcher.config.clone(); - - unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::core::ffi::c_void - ),* - ] - ) } - })(#(#launch_param_wrap),*) - } - } -} - -fn generate_type_wrap( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - stream: &syn::Lifetime, -) -> ( - Vec, - Vec, - Vec, - TokenStream, -) { - let mut any_ptx_jit = false; - - let mut async_params = Vec::with_capacity(func_inputs.len()); - let mut launch_param_types = Vec::with_capacity(func_inputs.len()); - let mut launch_param_wrap = Vec::with_capacity(func_inputs.len()); - let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .for_each(|(arg, ptx_jit)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - ptx_jit_param_wrap.push(if ptx_jit.0 { - any_ptx_jit = true; - - quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } - } else { - quote! { None } - }); - - let async_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::AsyncHostType<#stream, '_> - }; - - let async_param = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(async_ty), - pat: pat.clone(), - colon_token: *colon_token, - }); - - async_params.push(async_param); - - let launch_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::FfiType<#stream, '_> - }; - - launch_param_types.push(launch_ty); - - let launch_wrap = quote::quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::async_to_ffi(#pat) - }; - - launch_param_wrap.push(launch_wrap); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - let ptx_jit_param_wrap = if any_ptx_jit { - quote!(Some(&[#(#ptx_jit_param_wrap),*])) - } else { - quote!(None) - }; - - ( - async_params, - launch_param_types, - launch_param_wrap, - ptx_jit_param_wrap, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index b863a478f..eeb5cd5d4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -3,10 +3,7 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod kernel_func; -mod kernel_func_async; - use kernel_func::quote_kernel_func_inputs; -use kernel_func_async::quote_kernel_func_async; pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, @@ -26,19 +23,8 @@ pub(in super::super) fn quote_cpu_wrapper( func_params, func_attrs, ); - let kernel_func_async = quote_kernel_func_async( - crate_path, - impl_generics, - decl, - func_inputs, - fn_ident, - func_params, - func_attrs, - ); quote! { #kernel_func - - #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index a6d8ac550..8a5de226e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -11,20 +11,20 @@ pub(in super::super) fn quote_cuda_generic_function( generic_close_token, .. 
}: &DeclGenerics, - func_inputs: &syn::punctuated::Punctuated, + func_inputs: &syn::punctuated::Punctuated, FuncIdent { func_ident, .. }: &FuncIdent, func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { let kernel_func_inputs = func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - ty, - pat, - colon_token, - }) => { + .map( + |syn::PatType { + attrs, + ty, + pat, + colon_token, + }| { let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> }; @@ -36,8 +36,7 @@ pub(in super::super) fn quote_cuda_generic_function( colon_token: *colon_token, }) }, - syn::FnArg::Receiver(_) => unreachable!(), - }) + ) .collect::>(); quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 24365ee29..7fce0a925 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -9,10 +9,7 @@ use super::super::{ #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + inputs @ FunctionInputs { func_inputs }: &FunctionInputs, func @ FuncIdent { func_ident, func_ident_hash, @@ -39,34 +36,27 @@ pub(in super::super) fn quote_cuda_wrapper( .collect::>(); let ffi_param_ptx_jit_wrap = func_inputs - .iter().zip(func_input_cuda_types.iter()).enumerate() + .iter().enumerate() .rev() .fold(quote! { #func_ident(#(#func_params),*) - }, |inner, (_i, (arg, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { - pat, - ty, - .. - }) => { - // Emit PTX JIT load markers - // let ptx_jit_load = if ptx_jit.0 { - // quote! { - // #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - // } - // } else { quote! {} }; + }, |inner, (i, syn::PatType { + pat, + ty, + .. 
+ })| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) + }; - let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) - }; - - quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device( - #pat, |#pat| { #inner } - ) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), + // Load the device param from its FFI representation + // To allow some parameters to also inject PTX JIT load markers here, + // we pass them the param index i + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device::<_, #i>( + #pat, |#pat| { #inner } + ) + } }); quote! { @@ -108,39 +98,34 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ffi_input_types( crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> (Vec, Vec) { func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed( - syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) - }; + .map(|syn::PatType { + attrs, + pat, + colon_token, + ty, + }| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) + }; - let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> - }; + let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + }; - let ffi_param = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(ffi_ty.clone()), - pat: pat.clone(), - colon_token: *colon_token, - }); + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); - (ffi_param, ffi_ty) - }, - syn::FnArg::Receiver(_) => unreachable!(), + (ffi_param, ffi_ty) }) .unzip() } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs deleted file mode 100644 index 4ca2ff7bf..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ /dev/null @@ -1,65 +0,0 @@ -use syn::spanned::Spanned; - -pub(super) enum KernelInputAttribute { - PtxJit(proc_macro2::Span, bool), -} - -impl syn::parse::Parse for KernelInputAttribute { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let ident: syn::Ident = input.parse()?; - - match &*ident.to_string() { - "jit" => { - let eq: Option = input.parse()?; - - let (ptx_jit, span) = if eq.is_some() { - let value: syn::LitBool = input.parse()?; - - ( - value.value(), - ident - .span() - .join(eq.span()) - .unwrap() - .span() - .join(value.span()) - .unwrap(), - ) - } else { - (true, ident.span()) - }; - - Ok(KernelInputAttribute::PtxJit(span, ptx_jit)) - }, - _ => abort!( - ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `jit`.", - ident - ), - } - } -} - -pub(super) struct KernelInputAttributes(Vec); - -impl syn::parse::Parse for KernelInputAttributes { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let content; - let _parens = syn::parenthesized!(content in input); - - syn::punctuated::Punctuated::< - KernelInputAttribute, syn::token::Comma - >::parse_separated_nonempty(&content).map(|punctuated| { - Self(punctuated.into_iter().collect()) - }) - } -} - 
-impl IntoIterator for KernelInputAttributes { - type IntoIter = std::vec::IntoIter; - type Item = KernelInputAttribute; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs deleted file mode 100644 index 154503702..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ /dev/null @@ -1,87 +0,0 @@ -use syn::spanned::Spanned; - -use super::InputPtxJit; - -mod attribute; -use attribute::{KernelInputAttribute, KernelInputAttributes}; - -pub(super) struct FunctionInputs { - pub(super) func_inputs: syn::punctuated::Punctuated, - pub(super) func_input_cuda_types: Vec, -} - -pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { - let (func_inputs, func_input_cuda_types): ( - syn::punctuated::Punctuated, - Vec, - ) = func - .sig - .inputs - .iter() - .map(|arg| match arg { - receiver @ syn::FnArg::Receiver(_) => { - abort!(receiver.span(), "Kernel function must not have a receiver.") - }, - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let mut ptx_jit: Option = None; - - let attrs = attrs - .iter() - .filter(|attr| match attr.path.get_ident() { - Some(ident) if ident == "kernel" => { - let attrs: KernelInputAttributes = - match syn::parse_macro_input::parse(attr.tokens.clone().into()) { - Ok(data) => data, - Err(err) => abort!(attr.span(), err), - }; - - for attr in attrs { - match attr { - KernelInputAttribute::PtxJit(span, jit) - if ptx_jit.is_none() => - { - if !matches!(&**ty, syn::Type::Reference(_)) && jit { - abort!( - span, - "Only reference types can be PTX JIT loaded." 
- ); - } - - ptx_jit = Some(InputPtxJit(jit)); - }, - KernelInputAttribute::PtxJit(span, _jit) => { - abort!(span, "Duplicate PTX JIT declaration."); - }, - } - } - - false - }, - _ => true, - }) - .cloned() - .collect(); - - ( - syn::FnArg::Typed(syn::PatType { - attrs, - pat: pat.clone(), - colon_token: *colon_token, - ty: ty.clone(), - }), - ptx_jit.unwrap_or(InputPtxJit(false)), - ) - }, - }) - .unzip(); - - FunctionInputs { - func_inputs, - func_input_cuda_types, - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 79bae8dbd..4486f4c49 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -7,7 +7,6 @@ use proc_macro::TokenStream; mod config; mod generate; -mod inputs; mod parse; use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; @@ -17,7 +16,6 @@ use generate::{ cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, }; -use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; @@ -130,7 +128,19 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } }; - let mut func_inputs = parse_function_inputs(&func); + let mut func_inputs = FunctionInputs { + func_inputs: func + .sig + .inputs + .into_iter() + .map(|arg| match arg { + syn::FnArg::Typed(arg) => arg, + syn::FnArg::Receiver(_) => { + unreachable!("already checked that no receiver arg exists") + }, + }) + .collect(), + }; let generic_kernel_params = func.sig.generics.params.clone(); let (generic_start_token, generic_close_token) = @@ -161,7 +171,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let func_ident = FuncIdent { func_ident: &func.sig.ident, - func_ident_async: quote::format_ident!("{}_async", &func.sig.ident), func_ident_hash: 
quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), }; @@ -169,12 +178,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter() .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { pat, .. }) => match ident_from_pat(pat) { - Some(ident) => ident, - None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), - }, - syn::FnArg::Receiver(_) => unreachable!(), + .map(|(i, syn::PatType { pat, .. })| match ident_from_pat(pat) { + Some(ident) => ident, + None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), }) .collect::>(); @@ -182,29 +188,28 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .map(|(arg, ident)| match arg { - syn::FnArg::Typed(syn::PatType { + .map(|(arg, ident)| { + let syn::PatType { attrs, colon_token, ty, .. - }) => { - let ident_fn_arg = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: Box::new(syn::Pat::Ident(syn::PatIdent { - attrs: Vec::new(), - by_ref: None, - mutability: None, - ident: ident.clone(), - subpat: None, - })), - colon_token: *colon_token, - ty: ty.clone(), - }); - - std::mem::replace(arg, ident_fn_arg) - }, - syn::FnArg::Receiver(_) => unreachable!(), + } = arg; + + let ident_fn_arg = syn::PatType { + attrs: attrs.clone(), + pat: Box::new(syn::Pat::Ident(syn::PatIdent { + attrs: Vec::new(), + by_ref: None, + mutability: None, + ident: ident.clone(), + subpat: None, + })), + colon_token: *colon_token, + ty: ty.clone(), + }; + + std::mem::replace(arg, ident_fn_arg) }) .collect(); @@ -258,7 +263,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .into() } -struct InputPtxJit(bool); +struct FunctionInputs { + func_inputs: syn::punctuated::Punctuated, +} #[allow(clippy::struct_field_names)] struct DeclGenerics<'f> { @@ -276,7 +283,6 @@ struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] struct 
FuncIdent<'f> { func_ident: &'f syn::Ident, - func_ident_async: syn::Ident, func_ident_hash: syn::Ident, } diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 56aa60053..6d31697cf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -41,6 +41,20 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ); } + for param in &func.sig.inputs { + if let syn::FnArg::Receiver(receiver) = param { + abort!(receiver.span(), "Kernel function must not have a receiver."); + } + } + + if func.sig.inputs.len() > 12 { + abort!( + func.sig.inputs.span(), + "Kernel function has too many arguments, {} were found but at most 12 are supported.", + func.sig.inputs.len() + ); + } + match &func.sig.output { syn::ReturnType::Default => (), syn::ReturnType::Type(_, box syn::Type::Tuple(tuple)) if tuple.elems.is_empty() => (), diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml deleted file mode 100644 index dc5fe4249..000000000 --- a/rust-cuda-ptx-jit/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "rust-cuda-ptx-jit" -version = "0.1.0" -authors = ["Juniper Tyree "] -license = "MIT OR Apache-2.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[features] -default = [] -host = ["regex", "rustacuda", "lazy_static"] - -[dependencies] -rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } -regex = { version = "1.5", optional = true } -lazy_static = { version = "1.4", optional = true } diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs deleted file mode 100644 index c647a65eb..000000000 --- a/rust-cuda-ptx-jit/src/device.rs +++ /dev/null @@ -1,13 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(not(feature = "host")))] -macro_rules! 
PtxJITConstLoad { - ([$index:literal] => $reference:expr) => { - unsafe { - ::core::arch::asm!( - ::core::concat!("// //"), - in(reg32) *($reference as *const _ as *const u32), - ) - } - }; -} diff --git a/rust-cuda-ptx-jit/src/host/regex.rs b/rust-cuda-ptx-jit/src/host/regex.rs deleted file mode 100644 index 5cff3bdc9..000000000 --- a/rust-cuda-ptx-jit/src/host/regex.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[allow(unused_imports)] -use regex::bytes::Regex; - -lazy_static::lazy_static! { - pub static ref CONST_MARKER_REGEX: Regex = { - Regex::new( - r"(?-u)// %r\d+)-(?P\d+)> //" - ).unwrap() - }; - - pub static ref CONST_BASE_REGISTER_REGEX: Regex = { - Regex::new( - r"(?-u)ld\.global\.u32\s*(?P%r\d+)\s*,\s*\[(?P%r[ds]?\d+)]\s*;", - ).unwrap() - }; - - pub static ref CONST_LOAD_INSTRUCTION_REGEX: Regex = { - Regex::new( - r"(?x-u)(?P - ld\.global - (?:\.(?Pv[24]))? - \. - (?P[suf]) - (?P8|16|32|64) - \s* - (?P - (?:%[rf][sd]?\d+) | - (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) - ) - ,\s* - \[ - (?P%r[ds]?\d+) - (?: - \+ - (?P\d+) - )? 
- \] - \s*; - )", - ).unwrap() - }; - - pub static ref REGISTER_REGEX: Regex = { - Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap() - }; -} diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs deleted file mode 100644 index 8b25fc9a0..000000000 --- a/rust-cuda-ptx-jit/src/lib.rs +++ /dev/null @@ -1,23 +0,0 @@ -#![deny(clippy::pedantic)] -#![cfg_attr(not(feature = "host"), no_std)] -#![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] -#![feature(doc_cfg)] -#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] - -#[cfg(feature = "host")] -mod host; - -#[cfg(feature = "host")] -pub use host::{PtxJITCompiler, PtxJITResult}; - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -mod device; - -pub fn arg_as_raw_bytes(r: &T) -> *const [u8] { - core::ptr::slice_from_raw_parts( - core::ptr::from_ref(r).cast::(), - core::mem::size_of_val(r), - ) -} diff --git a/src/common.rs b/src/common.rs index c4a880262..5360ccbbc 100644 --- a/src/common.rs +++ b/src/common.rs @@ -8,7 +8,10 @@ use core::{ #[cfg(feature = "host")] use alloc::fmt; #[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; +use core::{ + mem::MaybeUninit, + ptr::{copy_nonoverlapping, NonNull}, +}; use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; @@ -333,19 +336,43 @@ pub trait CudaKernelParameter: sealed::Sealed { inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result; + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b>; #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O; } 
-#[repr(transparent)] +pub struct PtxJit { + never: !, + _marker: PhantomData, +} + +impl Deref for PtxJit { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl DerefMut for PtxJit { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + pub struct PerThreadShallowCopy< T: crate::safety::SafeDeviceCopy + crate::safety::NoSafeAliasing @@ -403,6 +430,14 @@ impl< )) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -411,7 +446,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -469,6 +504,14 @@ impl< inner(const_ref.as_async()) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -477,7 +520,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -495,7 +538,68 @@ impl< { } -#[repr(transparent)] +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a PerThreadShallowCopy as 
CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> +{ +} + pub struct ShallowInteriorMutable { never: !, _marker: PhantomData, @@ -553,6 +657,14 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter result } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -561,7 +673,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(not(feature = 
"host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -602,7 +714,6 @@ impl_atomic_interior_mutable! { // impl sealed::Sealed for // core::cell::SyncUnsafeCell {} -#[repr(transparent)] pub struct SharedHeapPerThreadShallowCopy { never: !, _marker: PhantomData, @@ -650,6 +761,14 @@ impl< crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -658,7 +777,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -701,6 +820,14 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -709,7 +836,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -749,6 +876,14 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: 
&Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( mut param: Self::AsyncHostType<'stream, 'b>, @@ -757,7 +892,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -773,3 +908,201 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy { } + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = + as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + 
emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a mut PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType< + 'stream, + 'b, + >; + type DeviceType<'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a mut SharedHeapPerThreadShallowCopy as 
CudaKernelParameter>::with_ffi_as_device::< + O, + PARAM, + >(param, inner) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a mut PtxJit> +{ +} + +#[cfg(feature = "host")] +fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { + NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) +} + +#[cfg(not(feature = "host"))] +fn emit_param_ptx_jit_marker(param: &T) { + unsafe { + core::arch::asm!( + "// //", + in(reg32) *(core::ptr::from_ref(param).cast::()), + const(INDEX), + ); + } +} diff --git a/src/host.rs b/src/host/mod.rs similarity index 77% rename from src/host.rs rename to src/host/mod.rs index f15bca27e..6cb31a508 100644 --- a/src/host.rs +++ b/src/host/mod.rs @@ -1,3 +1,4 @@ +use core::ptr::NonNull; use std::{ ffi::{CStr, CString}, marker::PhantomData, @@ -22,35 +23,99 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, - NoCudaAlloc, RustToCuda, + CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, + EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; +mod ptx_jit; +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + pub struct Launcher<'stream, 'kernel, Kernel> { pub stream: &'stream Stream, pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } +macro_rules! 
impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; +} + impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, - { - self.kernel.launch0(self.stream, &self.config) - } + impl_launcher_launch! { launch0() => launch0_async } - #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, arg1: A) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, - { - self.kernel.launch1(self.stream, &self.config, arg1) - } + impl_launcher_launch! { launch1(arg1: A) => launch1_async } + + impl_launcher_launch! { launch2(arg1: A, arg2: B) => launch2_async } + + impl_launcher_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + + impl_launcher_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + + impl_launcher_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => launch5_async } + + impl_launcher_launch! 
{ launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => launch8_async } + + impl_launcher_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => launch12_async } } #[derive(Clone, Debug, PartialEq)] @@ -131,10 +196,133 @@ pub struct TypedPtxKernel { marker: PhantomData, } +macro_rules! impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + stream: &Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + impl_typed_kernel_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + self.$launch_async::<$($T),*>(stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'stream, $($T: CudaKernelParameter),*>( + &mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + let kernel_jit_result = if config.ptx_jit { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + self.compile_with_ptx_jit_args(Some(&[$($arg),*])) + } }? + } else { + self.compile_with_ptx_jit_args(None)? + }; + let function = match kernel_jit_result { + KernelJITResult::Recompiled(function) + | KernelJITResult::Cached(function) => function, + }; + + unsafe { stream.launch( + function, + config.grid.clone(), + config.block.clone(), + config.shared_memory_size, + &[ + $(core::ptr::from_mut( + &mut $T::async_to_ffi($arg) + ).cast::()),* + ], + ) } + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; + (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func(&$arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + impl TypedPtxKernel { + impl_typed_kernel_launch! { launch0() => launch0_async } + + impl_typed_kernel_launch! 
{ launch1(arg1: A) => launch1_async } + + impl_typed_kernel_launch! { launch2(arg1: A, arg2: B) => launch2_async } + + impl_typed_kernel_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + + impl_typed_kernel_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + + impl_typed_kernel_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => launch5_async } + + impl_typed_kernel_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => launch6_async } + + impl_typed_kernel_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => launch7_async } + + impl_typed_kernel_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => launch8_async } + + impl_typed_kernel_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => launch9_async } + + impl_typed_kernel_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => launch10_async } + + impl_typed_kernel_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => launch11_async } + + impl_typed_kernel_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => launch12_async } + #[must_use] pub fn new>(configure: Option>) -> Self { - let compiler = crate::ptx_jit::PtxJITCompiler::new(T::get_ptx()); + let compiler = PtxJITCompiler::new(T::get_ptx()); let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); Self { @@ -151,9 +339,9 @@ impl TypedPtxKernel { /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to /// [`Self::new`] is not a valid PTX source or does not contain the /// entry point it declares. 
- pub fn compile_with_ptx_jit_args( + fn compile_with_ptx_jit_args( &mut self, - arguments: Option<&[Option<*const [u8]>]>, + arguments: Option<&[Option<&NonNull<[u8]>>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); @@ -179,51 +367,8 @@ impl TypedPtxKernel { Ok(kernel_jit) } - - #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self, stream: &Stream, config: &LaunchConfig) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, - { - (const { conjure::() })(&mut Launcher { - stream, - kernel: self, - config: config.clone(), - }) - } - - #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, stream: &Stream, config: &LaunchConfig, arg1: A) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, - { - (const { conjure::() })( - &mut Launcher { - stream, - kernel: self, - config: config.clone(), - }, - arg1, - ) - } -} - -const fn conjure() -> T { - union Transmute { - empty: (), - magic: T, - } - - assert!(std::mem::size_of::() == 0); - assert!(std::mem::align_of::() == 1); - - unsafe { Transmute { empty: () }.magic } } -struct Assert; -trait True {} -impl True for Assert {} - pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/src/host/ptx_jit/mod.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/mod.rs rename to src/host/ptx_jit/mod.rs diff --git a/rust-cuda-ptx-jit/src/host/preprocess.rs b/src/host/ptx_jit/preprocess.rs similarity index 93% rename from rust-cuda-ptx-jit/src/host/preprocess.rs rename to src/host/ptx_jit/preprocess.rs index 0ee17733f..c22cf63e9 100644 --- a/rust-cuda-ptx-jit/src/host/preprocess.rs +++ b/src/host/ptx_jit/preprocess.rs @@ -5,7 +5,7 @@ use std::{ use super::{ regex::{ - CONST_BASE_REGISTER_REGEX, CONST_LOAD_INSTRUCTION_REGEX, CONST_MARKER_REGEX, REGISTER_REGEX, + 
const_base_register_regex, const_load_instruction_regex, const_marker_regex, register_regex, }, PtxElement, PtxJITCompiler, PtxLoadWidth, }; @@ -19,7 +19,7 @@ impl PtxJITCompiler { let mut const_markers: HashMap<&[u8], usize> = HashMap::new(); // Find injected rust-cuda-const-markers which identify dummy register rxx - for const_marker in CONST_MARKER_REGEX.captures_iter(ptx) { + for const_marker in const_marker_regex().captures_iter(ptx) { if let Some(tmpreg) = const_marker.name("tmpreg").map(|s| s.as_bytes()) { if let Some(param) = const_marker .name("param") @@ -36,7 +36,7 @@ impl PtxJITCompiler { let mut const_base_registers: HashMap<&[u8], usize> = HashMap::new(); // Find base register ryy which was used in `ld.global.u32 rxx, [ryy];` - for const_base_register in CONST_BASE_REGISTER_REGEX.captures_iter(ptx) { + for const_base_register in const_base_register_regex().captures_iter(ptx) { if let Some(tmpreg) = const_base_register.name("tmpreg").map(|s| s.as_bytes()) { if let Some(param) = const_markers.get(tmpreg) { if let Some(basereg) = const_base_register.name("basereg").map(|s| s.as_bytes()) @@ -54,7 +54,7 @@ impl PtxJITCompiler { let mut ptx_slices: Vec = Vec::new(); // Iterate over all load from base register with offset instructions - for const_load_instruction in CONST_LOAD_INSTRUCTION_REGEX.captures_iter(ptx) { + for const_load_instruction in const_load_instruction_regex().captures_iter(ptx) { // Only consider instructions where the base register is ryy if let Some(basereg) = const_load_instruction.name("basereg").map(|s| s.as_bytes()) { if let Some(param) = const_base_registers.get(basereg) { @@ -100,7 +100,7 @@ impl PtxJITCompiler { parameter_index: *param, byte_offset: loadoffset, load_width: loadwidth, - registers: REGISTER_REGEX + registers: register_regex() .captures_iter(constreg) .filter_map(|m| { m.name("register").map(|s| { diff --git a/src/host/ptx_jit/regex.rs b/src/host/ptx_jit/regex.rs new file mode 100644 index 000000000..58406b01e --- 
/dev/null +++ b/src/host/ptx_jit/regex.rs @@ -0,0 +1,58 @@ +use std::sync::OnceLock; + +use regex::bytes::Regex; + +#[allow(clippy::module_name_repetitions)] +pub fn const_marker_regex() -> &'static Regex { + static CONST_MARKER_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_MARKER_REGEX.get_or_init(|| { + Regex::new(r"(?-u)// <rust-cuda-ptx-jit-const-load-(?P<tmpreg>%r\d+)-(?P<param>\d+)> //") + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn const_base_register_regex() -> &'static Regex { + static CONST_BASE_REGISTER_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_BASE_REGISTER_REGEX.get_or_init(|| { + Regex::new(r"(?-u)ld\.global\.u32\s*(?P<tmpreg>%r\d+)\s*,\s*\[(?P<basereg>%r[ds]?\d+)]\s*;") + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn const_load_instruction_regex() -> &'static Regex { + static CONST_LOAD_INSTRUCTION_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_LOAD_INSTRUCTION_REGEX.get_or_init(|| { + Regex::new( + r"(?x-u)(?P<instruction> + ld\.global + (?:\.(?P<vector>v[24]))? + \. + (?P<loadtype>[suf]) + (?P<loadwidth>8|16|32|64) + \s* + (?P<constreg> + (?:%[rf][sd]?\d+) | + (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) + ) + ,\s* + \[ + (?P<basereg>%r[ds]?\d+) + (?: + \+ + (?P<loadoffset>\d+) + )?
+ \] + \s*; + )", + ) + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn register_regex() -> &'static Regex { + static REGISTER_REGEX: OnceLock = OnceLock::new(); + REGISTER_REGEX.get_or_init(|| Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap()) +} diff --git a/rust-cuda-ptx-jit/src/host/replace.rs b/src/host/ptx_jit/replace.rs similarity index 96% rename from rust-cuda-ptx-jit/src/host/replace.rs rename to src/host/ptx_jit/replace.rs index 920842d6f..ed59701c7 100644 --- a/rust-cuda-ptx-jit/src/host/replace.rs +++ b/src/host/ptx_jit/replace.rs @@ -1,10 +1,11 @@ +use core::ptr::NonNull; use std::{ffi::CString, ops::Deref}; use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; impl PtxJITCompiler { #[allow(clippy::too_many_lines)] - pub fn with_arguments(&mut self, arguments: Option<&[Option<*const [u8]>]>) -> PtxJITResult { + pub fn with_arguments(&mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>) -> PtxJITResult { // Check if the arguments, cast as byte slices, are the same as the last cached // ones #[allow(clippy::explicit_deref_methods)] @@ -16,7 +17,7 @@ impl PtxJITCompiler { .zip(last_arguments.iter()) .all(|(a, b)| match (a, b) { (None, None) => false, - (Some(a), Some(b)) => (unsafe { &**a }) != b.deref(), + (Some(a), Some(b)) => (unsafe { a.as_ref() }) != b.deref(), _ => true, }) }, @@ -30,7 +31,9 @@ impl PtxJITCompiler { self.last_arguments = arguments.map(|arguments| { arguments .iter() - .map(|arg| arg.map(|bytes| unsafe { &*bytes }.to_owned().into_boxed_slice())) + .map(|arg| { + arg.map(|bytes| unsafe { bytes.as_ref() }.to_owned().into_boxed_slice()) + }) .collect::>>>() .into_boxed_slice() }); diff --git a/src/lib.rs b/src/lib.rs index 61b807d8b..0bf8b0e21 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,7 +40,6 @@ #[doc(hidden)] pub extern crate alloc; -pub extern crate rust_cuda_ptx_jit as ptx_jit; pub extern crate rustacuda_core; #[doc(hidden)] From adfff4328d6a1b62640d7f276bfaf8bcbe394d6c Mon Sep 17 00:00:00 
2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 11:28:02 +0000 Subject: [PATCH 056/120] Add async launch helper --- src/host/mod.rs | 153 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 29 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 6cb31a508..539d24207 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -39,7 +39,7 @@ pub struct Launcher<'stream, 'kernel, Kernel> { } macro_rules! impl_launcher_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<$($T: CudaKernelParameter),*>( @@ -55,6 +55,35 @@ macro_rules! impl_launcher_launch { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch_async<$($T: CudaKernelParameter),*>( @@ -70,52 +99,68 @@ macro_rules! 
impl_launcher_launch { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; } impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - impl_launcher_launch! { launch0() => launch0_async } + impl_launcher_launch! { launch0() => with0_async => launch0_async } - impl_launcher_launch! { launch1(arg1: A) => launch1_async } + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } - impl_launcher_launch! { launch2(arg1: A, arg2: B) => launch2_async } + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } - impl_launcher_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } - impl_launcher_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } impl_launcher_launch! { launch5( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => launch5_async } + ) => with5_async => launch5_async } impl_launcher_launch! { launch6( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => launch6_async } + ) => with6_async => launch6_async } impl_launcher_launch! { launch7( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => launch7_async } + ) => with7_async => launch7_async } impl_launcher_launch! { launch8( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => launch8_async } + ) => with8_async => launch8_async } impl_launcher_launch! 
{ launch9( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => launch9_async } + ) => with9_async => launch9_async } impl_launcher_launch! { launch10( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => launch10_async } + ) => with10_async => launch10_async } impl_launcher_launch! { launch11( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K - ) => launch11_async } + ) => with11_async => launch11_async } impl_launcher_launch! { launch12( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K, arg12: L - ) => launch12_async } + ) => with12_async => launch12_async } } #[derive(Clone, Debug, PartialEq)] @@ -197,7 +242,7 @@ pub struct TypedPtxKernel { } macro_rules! impl_typed_kernel_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<$($T: CudaKernelParameter),*>( @@ -206,6 +251,48 @@ macro_rules! impl_typed_kernel_launch { config: &LaunchConfig, $($arg: $T::SyncHostType),* ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + + // important: always synchronise here, this function is sync! 
+ match (stream.synchronize(), result) { + (Ok(()), result) => result, + (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), + } + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + &'stream Stream, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result where Kernel: Copy + FnOnce( &mut Launcher, @@ -213,7 +300,7 @@ macro_rules! impl_typed_kernel_launch { ) -> CudaResult<()>, { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - self.$launch_async::<$($T),*>(stream, config, $($arg),*) + inner(self, stream, config, $($arg),*) } } } @@ -276,49 +363,57 @@ macro_rules! impl_typed_kernel_launch { } impl TypedPtxKernel { - impl_typed_kernel_launch! { launch0() => launch0_async } + impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } - impl_typed_kernel_launch! { launch1(arg1: A) => launch1_async } + impl_typed_kernel_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } - impl_typed_kernel_launch! { launch2(arg1: A, arg2: B) => launch2_async } + impl_typed_kernel_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } - impl_typed_kernel_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + impl_typed_kernel_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } - impl_typed_kernel_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + impl_typed_kernel_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } impl_typed_kernel_launch! 
{ launch5( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => launch5_async } + ) => with5_async => launch5_async } impl_typed_kernel_launch! { launch6( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => launch6_async } + ) => with6_async => launch6_async } impl_typed_kernel_launch! { launch7( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => launch7_async } + ) => with7_async => launch7_async } impl_typed_kernel_launch! { launch8( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => launch8_async } + ) => with8_async => launch8_async } impl_typed_kernel_launch! { launch9( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => launch9_async } + ) => with9_async => launch9_async } impl_typed_kernel_launch! { launch10( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => launch10_async } + ) => with10_async => launch10_async } impl_typed_kernel_launch! { launch11( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K - ) => launch11_async } + ) => with11_async => launch11_async } impl_typed_kernel_launch! 
{ launch12( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K, arg12: L - ) => launch12_async } + ) => with12_async => launch12_async } #[must_use] pub fn new>(configure: Option>) -> Self { From 93e8d202e015a8618f41e2937ea241866c01383d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 13:03:31 +0000 Subject: [PATCH 057/120] Further cleanup of the new kernel param API --- examples/print/src/main.rs | 21 +------ .../wrapper/generate/cpu_wrapper/mod.rs | 30 ---------- .../kernel_func.rs => host_kernel_ty.rs} | 60 ++++++------------- .../args_trait.rs | 0 .../get_ptx.rs | 0 .../mod.rs | 2 +- .../src/kernel/wrapper/generate/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 ++--- rust-cuda-derive/src/kernel/wrapper/parse.rs | 2 +- src/common.rs | 39 ++++++++---- src/host/mod.rs | 36 +++++------ src/lib.rs | 2 + 12 files changed, 79 insertions(+), 131 deletions(-) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_wrapper/kernel_func.rs => host_kernel_ty.rs} (53%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/args_trait.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/get_ptx.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/mod.rs (98%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 3d2f776e4..62a0e2713 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -71,26 +71,11 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::Print, - )?; - // kernel(&mut launcher, Action::Print)?; + kernel.launch1(&stream, &config, Action::Print)?; 
println!("Launching panic kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::Panic, - )?; - // kernel(&mut launcher, Action::Panic)?; + kernel.launch1(&stream, &config, Action::Panic)?; println!("Launching alloc error kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::AllocError, - )?; - // kernel(&mut launcher, Action::AllocError)?; + kernel.launch1(&stream, &config, Action::AllocError)?; Ok(()) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs deleted file mode 100644 index eeb5cd5d4..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ /dev/null @@ -1,30 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -mod kernel_func; -use kernel_func::quote_kernel_func_inputs; - -pub(in super::super) fn quote_cpu_wrapper( - crate_path: &syn::Path, - decl: &DeclGenerics, - impl_generics: &ImplGenerics, - func_inputs: &FunctionInputs, - fn_ident: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let kernel_func = quote_kernel_func_inputs( - crate_path, - impl_generics, - decl, - func_inputs, - fn_ident, - func_params, - func_attrs, - ); - - quote! 
{ - #kernel_func - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs similarity index 53% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs index b854ce160..75c86820f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,48 +1,21 @@ use proc_macro2::TokenStream; -use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; -pub(super) fn quote_kernel_func_inputs( +pub(in super::super) fn quote_host_kernel_ty( crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, generic_start_token, generic_close_token, .. }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs - .iter() - .map( - |syn::PatType { - attrs, - ty, - pat, - colon_token, - }| { - let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType - }; - - ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(ty.clone()), - pat: pat.clone(), - colon_token: *colon_token, - }), - ty, - ) - }, - ) - .unzip(); - let cuda_kernel_param_tys = func_inputs .iter() .map(|syn::PatType { ty, .. 
}| &**ty) @@ -50,8 +23,6 @@ pub(super) fn quote_kernel_func_inputs( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let launch = quote::format_ident!("launch{}", func_inputs.len()); - let full_generics = generic_kernel_params .iter() .map(|param| match param { @@ -61,6 +32,9 @@ pub(super) fn quote_kernel_func_inputs( }) .collect::>(); + let mut private_func_ident = syn::Ident::clone(func_ident); + private_func_ident.set_span(proc_macro::Span::def_site().into()); + let ty_turbofish = ty_generics.as_turbofish(); quote! { @@ -68,28 +42,30 @@ pub(super) fn quote_kernel_func_inputs( #[allow(non_camel_case_types)] pub type #func_ident #generic_start_token #generic_kernel_params - #generic_close_token = impl Copy + Fn( + #generic_close_token = impl Fn( &mut #crate_path::host::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, - #(#kernel_func_input_tys),* - ) -> #crate_path::rustacuda::error::CudaResult<()>; + #(#cuda_kernel_param_tys),* + ); #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] - pub fn #func_ident <#generic_kernel_params>( + fn #private_func_ident #generic_start_token + #generic_kernel_params + #generic_close_token ( #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, - #(#kernel_func_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; + #func_inputs + ) { + let _: #func_ident <#(#full_generics),*> = #private_func_ident #ty_turbofish; - #launcher.#launch::< - #(#cuda_kernel_param_tys),* - >(#(#func_params),*) + #( + let _ = #func_params; + )* } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs similarity index 100% rename from 
rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs similarity index 98% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs index f68b9cf34..dc609da26 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -8,7 +8,7 @@ mod get_ptx; use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME -pub(in super::super) fn quote_cpu_linker_macro( +pub(in super::super) fn quote_host_linker_macro( crate_path: &syn::Path, KernelConfig { visibility, linker, .. 
diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs index c7a2fcabd..bf2c293cc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,4 @@ -pub mod cpu_linker_macro; -pub mod cpu_wrapper; pub mod cuda_generic_function; pub mod cuda_wrapper; +pub mod host_kernel_ty; +pub mod host_linker_macro; diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 4486f4c49..f3e1177bc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -13,8 +13,8 @@ use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ - cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, + host_kernel_ty::quote_host_kernel_ty, host_linker_macro::quote_host_linker_macro, }; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; @@ -213,7 +213,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let cpu_wrapper = quote_cpu_wrapper( + let host_kernel_ty = quote_host_kernel_ty( &crate_path, &decl_generics, &impl_generics, @@ -222,8 +222,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident); - let cpu_linker_macro = quote_cpu_linker_macro( + let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); + let host_linker_macro = quote_host_linker_macro( &crate_path, &config, &decl_generics, @@ -251,11 +251,11 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ); (quote! 
{ - #cpu_wrapper + #host_kernel_ty - #cpu_cuda_check + #host_generic_kernel_check - #cpu_linker_macro + #host_linker_macro #cuda_wrapper #cuda_generic_function diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 6d31697cf..8d1662772 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -48,7 +48,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { } if func.sig.inputs.len() > 12 { - abort!( + emit_warning!( func.sig.inputs.span(), "Kernel function has too many arguments, {} were found but at most 12 are supported.", func.sig.inputs.len() diff --git a/src/common.rs b/src/common.rs index 5360ccbbc..deaf85220 100644 --- a/src/common.rs +++ b/src/common.rs @@ -325,7 +325,9 @@ pub trait CudaKernelParameter: sealed::Sealed { type SyncHostType; #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; + #[doc(hidden)] type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b>; #[cfg(feature = "host")] @@ -336,18 +338,21 @@ pub trait CudaKernelParameter: sealed::Sealed { inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result; + #[doc(hidden)] #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O; + #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b>; - #[cfg(not(feature = "host"))] + #[doc(hidden)] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -414,6 +419,7 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(all(not(feature = 
"host"), target_os = "cuda"))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] @@ -445,7 +451,7 @@ impl< param } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -477,6 +483,7 @@ impl< 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -519,7 +526,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -549,6 +556,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; @@ -579,7 +587,7 @@ impl< <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -622,6 +630,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -672,7 
+681,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -746,6 +755,7 @@ impl< 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -776,7 +786,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -805,6 +815,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -835,7 +846,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -861,6 +872,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b mut T; type FfiType<'stream, 'b> = DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -891,7 +903,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(not(feature = 
"host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -919,6 +931,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -952,7 +965,7 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -979,6 +992,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -1013,7 +1027,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1039,6 +1053,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'stream, 'b, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -1073,7 +1088,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a mut 
SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1096,7 +1111,7 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) } -#[cfg(not(feature = "host"))] +#[cfg(all(not(feature = "host"), target_os = "cuda"))] fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( diff --git a/src/host/mod.rs b/src/host/mod.rs index 539d24207..e06e180f6 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -47,10 +47,10 @@ macro_rules! impl_launcher_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -71,10 +71,10 @@ macro_rules! impl_launcher_launch { ) -> Result, ) -> Result where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { #[allow(unused_variables)] let stream = self.stream; @@ -91,10 +91,10 @@ macro_rules! impl_launcher_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -252,10 +252,10 @@ macro_rules! 
impl_typed_kernel_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.$with_async::<(), CudaError, $($T),*>( stream, @@ -294,10 +294,10 @@ macro_rules! impl_typed_kernel_launch { ) -> Result, ) -> Result where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { inner(self, stream, config, $($arg),*) @@ -314,10 +314,10 @@ macro_rules! impl_typed_kernel_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { let kernel_jit_result = if config.ptx_jit { impl_typed_kernel_launch! 
{ impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { diff --git a/src/lib.rs b/src/lib.rs index 0bf8b0e21..392928d29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,8 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] +#![feature(tuple_trait)] +#![feature(unboxed_closures)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From b2ce9ee2c83741fc3cef74f5be27579243425d99 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 14:26:46 +0000 Subject: [PATCH 058/120] Start cleaning up the public API --- .github/workflows/ci.yml | 4 +- .github/workflows/coverage.yml | 1 - Cargo.toml | 4 +- src/common.rs | 44 +++++----- src/device/mod.rs | 75 +++++++--------- src/host/mod.rs | 130 +++++++++++----------------- src/utils/aliasing/const.rs | 3 +- src/utils/aliasing/dynamic.rs | 3 +- src/utils/aliasing/final.rs | 3 +- src/utils/box.rs | 3 +- src/utils/boxed_slice.rs | 3 +- src/utils/device_copy.rs | 2 +- src/utils/exchange/buffer/common.rs | 3 +- src/utils/option.rs | 3 +- 14 files changed, 112 insertions(+), 169 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 954395a77..07fa4ab26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack check --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive \ + --skip host,rustacuda,rustacuda_derive,regex \ --keep-going \ --target nvptx64-nvidia-cuda @@ -180,7 +180,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack clippy --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive \ + --skip host,rustacuda,rustacuda_derive,regex \ --keep-going \ --target nvptx64-nvidia-cuda \ -- -D warnings diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 904e1a65c..176d98baa 100644 --- 
a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -59,7 +59,6 @@ jobs: ./grcov . -s . --binary-path ./target/debug/deps \ -t lcov -o coverage.lcov --branch \ --keep-only "src/*" \ - --keep-only "rust-cuda-ptx-jit/*" \ --keep-only "rust-cuda-derive/*" \ --ignore-not-existing \ --excl-line GRCOV_EXCL_LINE \ diff --git a/Cargo.toml b/Cargo.toml index 0a1375547..43687be65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,8 @@ rust-version = "1.75" # nightly [features] default = [] -host = ["rustacuda", "regex"] -derive = ["rustacuda_derive", "rust-cuda-derive"] +host = ["dep:rustacuda", "dep:regex"] +derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] [dependencies] rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } diff --git a/src/common.rs b/src/common.rs index deaf85220..e4ee3f804 100644 --- a/src/common.rs +++ b/src/common.rs @@ -86,8 +86,8 @@ pub unsafe trait RustToCuda { type CudaAllocation: CudaAlloc; type CudaRepresentation: CudaAsRust + TypeGraphLayout; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -107,8 +107,8 @@ pub unsafe trait RustToCuda { CombinedCudaAlloc, )>; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -129,8 +129,8 @@ pub unsafe trait RustToCuda { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCudaAsync: RustToCuda { + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -153,8 +153,8 @@ pub unsafe trait RustToCudaAsync: RustToCuda { CombinedCudaAlloc, )>; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// 
/// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -177,8 +177,8 @@ pub unsafe trait RustToCudaAsync: RustToCuda { pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { type RustRepresentation: RustToCuda; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[doc(hidden)] + #[cfg(not(feature = "host"))] /// # Safety /// /// This is an internal function and should NEVER be called manually @@ -283,23 +283,31 @@ mod private { } pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} pub trait CudaAlloc: crate_private::alloc::Sealed {} -impl CudaAlloc for T {} +impl CudaAlloc for Option {} impl crate_private::alloc::Sealed for Option {} pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} impl crate_private::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} impl private::empty::Sealed for NoCudaAlloc {} pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} impl crate_private::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} impl !private::empty::Sealed for SomeCudaAlloc {} pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} impl crate_private::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for CombinedCudaAlloc +{ +} impl private::empty::Sealed for CombinedCudaAlloc { @@ -791,10 +799,7 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // The type contains no allocations and is safe to copy - let param = unsafe { CudaAsRust::as_rust(param.as_ref()) }; - - inner(param) + unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } } } impl< @@ -851,11 +856,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // Safety: param must never be dropped as we do NOT 
own any of the - // heap memory it might reference - let param = core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); - - inner(¶m) + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed @@ -905,15 +906,10 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( - mut param: Self::FfiType<'static, 'static>, + param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // Safety: param must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut param = - core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); - - inner(&mut param) + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed diff --git a/src/device/mod.rs b/src/device/mod.rs index 93811bb04..5ce92bbbe 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,28 +1,27 @@ -use core::{ - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; +use core::mem::ManuallyDrop; #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - safety::SafeDeviceCopy, + common::{ + CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, RustToCuda, + }, + safety::{NoSafeAliasing, SafeDeviceCopy}, }; pub mod alloc; pub mod thread; pub mod utils; -pub trait BorrowFromRust: RustToCuda { +pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the /// [`DeviceConstRef`] borrowed on the CPU using the corresponding /// 
[`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). - unsafe fn with_borrow_from_rust) -> O>( + unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O; @@ -35,7 +34,7 @@ pub trait BorrowFromRust: RustToCuda { /// Furthermore, since different GPU threads can access heap storage /// mutably inside the safe `inner` scope, there must not be any /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut) -> O>( + unsafe fn with_borrow_from_rust_mut O>( cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, ) -> O; @@ -43,10 +42,10 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + cuda_repr_mut: DeviceOwnedRef::CudaRepresentation>>, inner: F, ) -> O where @@ -54,34 +53,46 @@ pub trait BorrowFromRust: RustToCuda { ::CudaRepresentation: SafeDeviceCopy; } -impl BorrowFromRust for T { +impl BorrowFromRust for T { #[inline] - unsafe fn with_borrow_from_rust) -> O>( + unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O { - // rust_repr must never be dropped as we do NOT own any of the + // `rust_repr` must never be dropped as we do NOT own any of the // heap memory it might reference - let rust_repr = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr.as_ref())); + let rust_repr = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); inner(&rust_repr) } #[inline] - unsafe fn with_borrow_from_rust_mut) -> O>( + unsafe fn with_borrow_from_rust_mut O>( mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, ) -> O { - // rust_repr must never be dropped as we 
do NOT own any of the + // `rust_repr_mut` must never be dropped as we do NOT own any of the // heap memory it might reference - let mut rust_repr_mut = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - + let mut rust_repr_mut = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); + + // Ideally, we would only provide access to `Pin<&mut T>` s.t. + // `core::mem::replace` cannot be used. + // However, we should still be fine because + // - the shallow part of `rust_repr_mut` is a unique copy per thread, so + // replacing it affects no other thread (immediately, drop is handled below) + // - any deep parts of `rust_repr_mut` are not allowed to hand out aliasing + // mutable references, so any deep memory replacement would not affect other + // threads + // - since any deep data is allocated from the host, we *hope* that trying to + // drop it in a CUDA thread turns into a no-op inner(&mut rust_repr_mut) } #[inline] unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + mut cuda_repr_mut: DeviceOwnedRef< + DeviceAccessible<::CudaRepresentation>, + >, inner: F, ) -> O where @@ -91,27 +102,3 @@ impl BorrowFromRust for T { inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) } } - -#[repr(transparent)] -#[derive(Debug)] -pub struct ShallowCopy(ManuallyDrop); - -impl ShallowCopy { - fn new(value: T) -> Self { - Self(ManuallyDrop::new(value)) - } -} - -impl Deref for ShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for ShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs index e06e180f6..f619b4415 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -26,7 +26,7 @@ use crate::{ CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - safety::SafeDeviceCopy, + safety::{NoSafeAliasing, SafeDeviceCopy}, }; 
mod ptx_jit; @@ -47,10 +47,7 @@ macro_rules! impl_launcher_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -71,10 +68,7 @@ macro_rules! impl_launcher_launch { ) -> Result, ) -> Result where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { #[allow(unused_variables)] let stream = self.stream; @@ -91,10 +85,7 @@ macro_rules! impl_launcher_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -173,12 +164,12 @@ pub struct LaunchConfig { #[doc(cfg(feature = "host"))] #[allow(clippy::module_name_repetitions)] -pub struct PtxKernel { +pub struct RawPtxKernel { module: ManuallyDrop>, function: ManuallyDrop>, } -impl PtxKernel { +impl RawPtxKernel { /// # Errors /// /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does @@ -211,7 +202,7 @@ impl PtxKernel { } } -impl Drop for PtxKernel { +impl Drop for RawPtxKernel { fn drop(&mut self) { { // Ensure that self.function is dropped before self.module as @@ -226,16 +217,11 @@ impl Drop for PtxKernel { } } -pub enum KernelJITResult<'k> { - Cached(&'k Function<'k>), - Recompiled(&'k Function<'k>), -} - pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; pub struct TypedPtxKernel { compiler: PtxJITCompiler, - ptx_kernel: Option, + ptx_kernel: Option, entry_point: Box, configure: Option>, marker: PhantomData, @@ -252,10 +238,7 @@ macro_rules! 
impl_typed_kernel_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.$with_async::<(), CudaError, $($T),*>( stream, @@ -294,10 +277,7 @@ macro_rules! impl_typed_kernel_launch { ) -> Result, ) -> Result where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { inner(self, stream, config, $($arg),*) @@ -314,22 +294,15 @@ macro_rules! impl_typed_kernel_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { - let kernel_jit_result = if config.ptx_jit { + let function = if config.ptx_jit { impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { self.compile_with_ptx_jit_args(Some(&[$($arg),*])) } }? } else { self.compile_with_ptx_jit_args(None)? }; - let function = match kernel_jit_result { - KernelJITResult::Recompiled(function) - | KernelJITResult::Cached(function) => function, - }; unsafe { stream.launch( function, @@ -362,6 +335,22 @@ macro_rules! impl_typed_kernel_launch { }; } +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); + + Self { + compiler, + ptx_kernel: None, + entry_point, + configure, + marker: PhantomData::, + } + } +} + impl TypedPtxKernel { impl_typed_kernel_launch! 
{ launch0() => with0_async => launch0_async } @@ -415,20 +404,6 @@ impl TypedPtxKernel { arg11: K, arg12: L ) => with12_async => launch12_async } - #[must_use] - pub fn new>(configure: Option>) -> Self { - let compiler = PtxJITCompiler::new(T::get_ptx()); - let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - - Self { - compiler, - ptx_kernel: None, - entry_point, - configure, - marker: PhantomData::, - } - } - /// # Errors /// /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to @@ -437,15 +412,13 @@ impl TypedPtxKernel { fn compile_with_ptx_jit_args( &mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>, - ) -> CudaResult { + ) -> CudaResult<&Function> { let ptx_jit = self.compiler.with_arguments(arguments); let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { - (Some(ptx_kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(ptx_kernel.get_function()) - }, + (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_ptx_kernel = PtxKernel::new(ptx_cstr, &self.entry_point)?; + let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; // Replace the existing compiled kernel, drop the old one let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); @@ -456,7 +429,7 @@ impl TypedPtxKernel { configure(function)?; } - KernelJITResult::Recompiled(function) + function }, }; @@ -464,7 +437,7 @@ impl TypedPtxKernel { } } -pub trait LendToCuda: RustToCuda { +pub trait LendToCuda: RustToCuda + NoSafeAliasing { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure @@ -525,12 +498,10 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc; + Self: RustToCuda; } -impl LendToCuda for T { 
+impl LendToCuda for T { fn lend_to_cuda< O, E: From, @@ -583,9 +554,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -597,23 +566,21 @@ impl LendToCuda for T { } } -mod private { - pub mod drop { - pub trait Sealed: Sized { - fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; - } - } +pub trait CudaDroppable: Sized { + #[allow(clippy::missing_errors_doc)] + fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; } #[repr(transparent)] -pub struct CudaDropWrapper(ManuallyDrop); -impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} -impl From for CudaDropWrapper { +pub struct CudaDropWrapper(ManuallyDrop); +impl crate::common::CudaAlloc for CudaDropWrapper {} +impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} +impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) } } -impl Drop for CudaDropWrapper { +impl Drop for CudaDropWrapper { fn drop(&mut self) { // Safety: drop is only ever called once let val = unsafe { ManuallyDrop::take(&mut self.0) }; @@ -623,14 +590,14 @@ impl Drop for CudaDropWrapper { } } } -impl Deref for CudaDropWrapper { +impl Deref for CudaDropWrapper { type Target = C; fn deref(&self) -> &Self::Target { &self.0 } } -impl DerefMut for CudaDropWrapper { +impl DerefMut for CudaDropWrapper { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } @@ -638,7 +605,7 @@ impl DerefMut for CudaDropWrapper { macro_rules! impl_sealed_drop_collection { ($type:ident) => { - impl private::drop::Sealed for $type { + impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) } @@ -653,7 +620,7 @@ impl_sealed_drop_collection!(LockedBox); macro_rules! 
impl_sealed_drop_value { ($type:ident) => { - impl private::drop::Sealed for $type { + impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) } @@ -727,6 +694,7 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); +impl crate::common::CudaAlloc for HostDeviceBox {} impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 91496a47d..c40b3642a 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -248,8 +248,7 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index d7b48b05f..c70ca80f8 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -226,8 +226,7 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 019ece1b6..366de9557 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -83,8 +83,7 @@ unsafe impl RustToCudaAsync for Final { unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = 
"host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { Final::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/box.rs b/src/utils/box.rs index 8e81941a1..ab0b22708 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -79,8 +79,7 @@ unsafe impl RustToCuda for Box { unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(this.0) } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 4a06e0a8d..588fa8c07 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -81,8 +81,7 @@ unsafe impl RustToCuda for Box<[T]> { unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { type RustRepresentation = Box<[T]>; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 2869cd296..b06735692 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -133,7 +133,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; - #[cfg(any(not(feature = "host"), doc))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { let mut uninit = core::mem::MaybeUninit::uninit(); core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index c5d1f9128..12a491b20 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -28,8 +28,7 @@ unsafe impl; - 
#[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw( core::slice::from_raw_parts_mut(this.0, this.1), diff --git a/src/utils/option.rs b/src/utils/option.rs index f939f5ba0..a7b3e991e 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -146,8 +146,7 @@ unsafe impl RustToCudaAsync for Option { unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = Option<::RustRepresentation>; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { if this.present { Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) From ed082f24ee1971772a1ff20c6024c7ef689ba995 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 05:21:39 +0000 Subject: [PATCH 059/120] Allow passing ThreadBlockShared to kernels again --- examples/single-source/src/main.rs | 19 +++----- src/common.rs | 70 +++++++++++++++++++++++++++++- src/device/thread.rs | 10 ++--- src/device/utils.rs | 2 + src/host/mod.rs | 21 ++++----- src/lib.rs | 8 +++- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/device_copy.rs | 17 ++++++-- src/utils/exchange/buffer/mod.rs | 12 ++--- src/utils/exchange/wrapper.rs | 16 +++---- src/utils/shared/slice.rs | 4 +- src/utils/shared/static.rs | 4 +- 13 files changed, 137 insertions(+), 54 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index f53963f9d..d88d628a7 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -15,9 +15,6 @@ extern crate alloc; -#[cfg(target_os = "cuda")] -use rc::utils::shared::r#static::ThreadBlockShared; - #[cfg(not(target_os = "cuda"))] fn main() {} @@ 
-69,12 +66,11 @@ pub fn kernel< _z: &rc::common::SharedHeapPerThreadShallowCopy>, _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, - Tuple(s, mut __t): rc::common::PerThreadShallowCopy, - q: rc::common::PerThreadShallowCopy, - // shared3: ThreadBlockShared, + q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, + shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, ) { - let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); - let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { @@ -84,10 +80,9 @@ pub fn kernel< (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } - // unsafe { core::arch::asm!("hi") } - // unsafe { - // *shared3.as_mut_ptr() = 12; - // } + unsafe { + *shared3.as_mut_ptr() = 12; + } } #[cfg(not(target_os = "cuda"))] diff --git a/src/common.rs b/src/common.rs index e4ee3f804..b0263031b 100644 --- a/src/common.rs +++ b/src/common.rs @@ -313,7 +313,8 @@ impl private::empt { } impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { Self(front, tail) } @@ -1117,3 +1118,70 @@ fn emit_param_ptx_jit_marker(param: &T) { ); } } + +mod private_shared { + use const_type_layout::TypeGraphLayout; + use rustacuda_core::DeviceCopy; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _marker: [T; 0], + } + + // Safety: there is nothing to copy, this is just a zero-sized marker type + unsafe impl DeviceCopy for ThreadBlockSharedFfi {} +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::r#static::ThreadBlockShared +{ + 
#[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + _param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedFfi { _marker: [] } + } + + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[allow(clippy::inline_always)] + #[inline(always)] + fn with_ffi_as_device( + _param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let mut param = crate::utils::shared::r#static::ThreadBlockShared::new_uninit(); + + inner(&mut param) + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::r#static::ThreadBlockShared +{ +} diff --git a/src/device/thread.rs b/src/device/thread.rs index 26ee357d2..b2f3035bd 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -20,7 +20,7 @@ impl Thread { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn this() -> Self { + pub const fn this() -> Self { Self { _private: () } } @@ -54,7 +54,7 @@ impl Thread { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn block(&self) -> ThreadBlock { + pub const fn block(&self) -> 
ThreadBlock { ThreadBlock { _private: () } } } @@ -91,7 +91,7 @@ impl ThreadBlock { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn grid(&self) -> ThreadBlockGrid { + pub const fn grid(&self) -> ThreadBlockGrid { ThreadBlockGrid { _private: () } } @@ -138,7 +138,7 @@ impl Dim3 { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn size(&self) -> usize { + pub const fn size(&self) -> usize { (self.x as usize) * (self.y as usize) * (self.z as usize) } } @@ -147,7 +147,7 @@ impl Idx3 { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn as_id(&self, dim: &Dim3) -> usize { + pub const fn as_id(&self, dim: &Dim3) -> usize { (self.x as usize) + (self.y as usize) * (dim.x as usize) + (self.z as usize) * (dim.x as usize) * (dim.y as usize) diff --git a/src/device/utils.rs b/src/device/utils.rs index 073e7bd54..3b37307a6 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -48,6 +48,7 @@ pub fn print(args: ::core::fmt::Arguments) { } let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] let msg = if let Some(msg) = args.as_str() { msg } else { @@ -87,6 +88,7 @@ pub fn pretty_panic_handler( } let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] let msg = if let Some(message) = info.message() { if let Some(msg) = message.as_str() { msg diff --git a/src/host/mod.rs b/src/host/mod.rs index f619b4415..685584659 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -154,7 +154,7 @@ impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { ) => with12_async => launch12_async } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct LaunchConfig { pub grid: rustacuda::function::GridSize, pub block: rustacuda::function::BlockSize, @@ -676,7 +676,7 @@ impl From> for LockedBox { fn from(host_locked_box: HostLockedBox) -> Self { // Safety: pointer comes from 
[`LockedBox::into_raw`] // i.e. this function completes the roundtrip - unsafe { LockedBox::from_raw(host_locked_box.0) } + unsafe { Self::from_raw(host_locked_box.0) } } } @@ -790,7 +790,7 @@ impl From> for DeviceBox { fn from(host_device_box: HostDeviceBox) -> Self { // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip - unsafe { DeviceBox::from_device(host_device_box.0) } + unsafe { Self::from_device(host_device_box.0) } } } @@ -918,7 +918,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { + pub const unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { Self { device_box, host_ref, @@ -962,12 +962,12 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[must_use] - pub fn for_host(&'a self) -> &'a T { + pub const fn for_host(&'a self) -> &'a T { self.host_ref } #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> where 'a: 'b, { @@ -975,7 +975,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[must_use] - pub fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> where 'a: 'b, { @@ -1124,7 +1124,8 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( + #[must_use] + pub const unsafe fn new( device_box: &'a HostDeviceBox, host_ref: &'a T, stream: &'stream Stream, @@ -1154,12 +1155,12 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[must_use] - pub fn for_host(&'a self) -> &'a T { + pub const fn for_host(&'a self) -> &'a T { 
self.host_ref } #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> where 'a: 'b, { diff --git a/src/lib.rs b/src/lib.rs index 392928d29..26f56adb6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,10 @@ -#![deny(clippy::pedantic)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] #![cfg_attr(not(feature = "host"), no_std)] #![feature(associated_type_bounds)] diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index c40b3642a..759b14dc9 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -14,7 +14,7 @@ pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { #[must_use] - pub fn new(inner: T) -> Self { + pub const fn new(inner: T) -> Self { Self(inner) } } @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn alias_unchecked(&self) -> &T { + pub const unsafe fn alias_unchecked(&self) -> &T { &self.0 } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c70ca80f8..a3cecfa8f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -17,7 +17,7 @@ pub struct SplitSliceOverCudaThreadsDynamicStride { impl SplitSliceOverCudaThreadsDynamicStride { #[must_use] - pub fn new(inner: T, stride: usize) -> Self { + pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } } } @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
/// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn alias_unchecked(&self) -> &T { + pub const unsafe fn alias_unchecked(&self) -> &T { &self.inner } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index b06735692..0a92e69a1 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -28,45 +28,54 @@ impl From for SafeDeviceCopyWrapper { } impl SafeDeviceCopyWrapper { + #[must_use] pub fn into_inner(self) -> T { self.0 } - pub fn from_ref(reference: &T) -> &Self { + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(reference as *const T).cast() } } - pub fn into_ref(&self) -> &T { + #[must_use] + pub const fn into_ref(&self) -> &T { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(self as *const Self).cast() } } + #[must_use] pub fn from_mut(reference: &mut T) -> &mut Self { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(reference as *mut T).cast() } } + #[must_use] pub fn into_mut(&mut self) -> &mut T { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(self as *mut Self).cast() } } - pub fn from_slice(slice: &[T]) -> &[Self] { + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } - pub fn into_slice(slice: &[Self]) -> &[T] { + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } + #[must_use] pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around 
[`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } + #[must_use] pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c4e4b24bd..66b2144c1 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -32,7 +32,7 @@ unsafe impl rustacuda_core: impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn read(&self) -> &T { + pub const fn read(&self) -> &T { &self.0 } @@ -46,7 +46,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn read(&self) -> &T { + pub const fn read(&self) -> &T { &self.0 } @@ -66,7 +66,7 @@ impl AsMut for CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn as_scratch(&self) -> &T { + pub const fn as_scratch(&self) -> &T { &self.0 } @@ -80,7 +80,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn as_scratch(&self) -> &T { + pub const fn as_scratch(&self) -> &T { &self.0 } @@ -94,7 +94,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn as_uninit(&self) -> &MaybeUninit { + pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype @@ -114,7 +114,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn as_uninit(&self) -> &MaybeUninit { + pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a 
transparent newtype union // - CudaExchangeItem is a transparent newtype diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 4ca2474d4..4edfdebd8 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -224,13 +224,13 @@ impl<'stream, T: RustToCuda> IntoFuture core::future::poll_fn(move |cx| match &wrapper { Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => match inner.waker.lock() { - Ok(mut w) => { + Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut w| { *w = Some(cx.waker().clone()); Poll::Pending }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), - }, + ), Ok(EventStatus::Ready) => match wrapper.take() { Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { value: inner.value, @@ -419,13 +419,13 @@ impl<'stream, T: RustToCuda> IntoFuture core::future::poll_fn(move |cx| match &wrapper { Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => match inner.waker.lock() { - Ok(mut w) => { + Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut w| { *w = Some(cx.waker().clone()); Poll::Pending }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), - }, + ), Ok(EventStatus::Ready) => match wrapper.take() { Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { value: inner.value, diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 804623ae4..920f0c58d 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -63,14 +63,14 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { + pub const fn as_mut_ptr(&self) -> *mut T { self.shared.cast() } #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_slice_ptr(&self) -> *mut 
[T] { + pub const fn as_mut_slice_ptr(&self) -> *mut [T] { self.shared } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 0ba7f9df0..41ba334ba 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -9,6 +9,8 @@ pub struct ThreadBlockShared { impl ThreadBlockShared { #[must_use] + #[allow(clippy::inline_always, clippy::missing_const_for_fn)] + #[inline(always)] pub fn new_uninit() -> Self { #[cfg(not(target_os = "cuda"))] { @@ -38,7 +40,7 @@ impl ThreadBlockShared { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { + pub const fn as_mut_ptr(&self) -> *mut T { self.shared } } From ea74fa2edf0b7cd7745734776886690e2ec2221a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 05:28:22 +0000 Subject: [PATCH 060/120] Remove unsound mutable lending to CUDA for now --- examples/single-source/src/main.rs | 1 - src/common.rs | 115 ----------------------------- src/device/mod.rs | 50 ++++--------- src/host/mod.rs | 46 ------------ 4 files changed, 13 insertions(+), 199 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index d88d628a7..1b677192d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -62,7 +62,6 @@ pub fn kernel< + rc::safety::NoSafeAliasing, >( _x: &rc::common::PerThreadShallowCopy, - _y: &mut rc::common::PtxJit>>, _z: &rc::common::SharedHeapPerThreadShallowCopy>, _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, diff --git a/src/common.rs b/src/common.rs index b0263031b..2e8d72a7c 100644 --- a/src/common.rs +++ b/src/common.rs @@ -865,59 +865,6 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a mut SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type 
AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceMutRefAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - type DeviceType<'b> = &'b mut T; - type FfiType<'stream, 'b> = - DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - mut param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a mut SharedHeapPerThreadShallowCopy -{ -} - impl< T: RustToCuda< CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, @@ -1041,68 +988,6 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a mut PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType< - 'stream, - 'b, - >; - #[cfg(all(not(feature = "host"), target_os = 
"cuda"))] - type DeviceType<'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::< - O, - PARAM, - >(param, inner) - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a mut PtxJit> -{ -} - #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) diff --git a/src/device/mod.rs b/src/device/mod.rs index 5ce92bbbe..c4a459087 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -5,9 +5,7 @@ use core::mem::ManuallyDrop; pub use 
rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ - common::{ - CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, RustToCuda, - }, + common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceOwnedRef, RustToCuda}, safety::{NoSafeAliasing, SafeDeviceCopy}, }; @@ -26,18 +24,18 @@ pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { inner: F, ) -> O; - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr_mut` is the - /// [`DeviceMutRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). - /// Furthermore, since different GPU threads can access heap storage - /// mutably inside the safe `inner` scope, there must not be any - /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O; + // /// # Safety + // /// + // /// This function is only safe to call iff `cuda_repr_mut` is the + // /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + // /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). + // /// Furthermore, since different GPU threads can access heap storage + // /// mutably inside the safe `inner` scope, there must not be any + // /// aliasing between concurrently running threads. 
+ // unsafe fn with_borrow_from_rust_mut O>( + // cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, + // ) -> O; /// # Safety /// @@ -66,28 +64,6 @@ impl BorrowFromRust for T { inner(&rust_repr) } - #[inline] - unsafe fn with_borrow_from_rust_mut O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O { - // `rust_repr_mut` must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut rust_repr_mut = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - - // Ideally, we would only provide access to `Pin<&mut T>` s.t. - // `core::mem::replace` cannot be used. - // However, we should still be fine because - // - the shallow part of `rust_repr_mut` is a unique copy per thread, so - // replacing it affects no other thread (immediately, drop is handled below) - // - any deep parts of `rust_repr_mut` are not allowed to hand out aliasing - // mutable references, so any deep memory replacement would not affect other - // threads - // - since any deep data is allocated from the host, we *hope* that trying to - // drop it in a CUDA thread turns into a no-op - inner(&mut rust_repr_mut) - } - #[inline] unsafe fn with_moved_from_rust O>( mut cuda_repr_mut: DeviceOwnedRef< diff --git a/src/host/mod.rs b/src/host/mod.rs index 685584659..424b5726c 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -457,31 +457,6 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result; - /// Lends a mutable copy of `&mut self` to CUDA: - /// - code in the CUDA kernel can only access `&mut self` through the - /// [`DeviceMutRef`] inside the closure - /// - after the closure, `&mut self` might have changed in the following - /// ways: - /// - to avoid aliasing, each CUDA thread gets its own shallow copy of - /// `&mut self`, i.e. any shallow changes will NOT be reflected after - /// the closure - /// - each CUDA thread can access the same heap allocated storage, i.e. 
- /// any deep changes will be reflected after the closure - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result; - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] /// /// # Errors @@ -522,27 +497,6 @@ impl LendToCuda for T { result } - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); - - core::mem::drop(cuda_repr); - - let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; - - result - } - fn move_to_cuda< O, E: From, From 000a3f60cf48ab8b46c5ada22c7f3040c68b851f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 08:46:31 +0000 Subject: [PATCH 061/120] Allow passing ThreadBlockSharedSlice to kernel for dynamic shared memory --- .github/workflows/rustdoc.yml | 2 +- Cargo.toml | 2 +- examples/single-source/src/main.rs | 8 ++ .../kernel/wrapper/generate/cuda_wrapper.rs | 33 +++-- src/common.rs | 114 +++++++++++++++--- src/utils/shared/slice.rs | 57 +++++++++ 6 files changed, 183 insertions(+), 33 deletions(-) diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index 23f4f1c07..5c756572c 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -28,7 +28,7 @@ jobs: run: | RUSTDOCFLAGS="\ --enable-index-page \ - --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.1/ \ --extern-html-root-url final=https://docs.rs/final/0.1.1/ \ --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url 
rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ diff --git a/Cargo.toml b/Cargo.toml index 43687be65..a218c629d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc regex = { version = "1.10", optional = true } -const-type-layout = { version = "0.2.0", features = ["derive"] } +const-type-layout = { version = "0.2.1", features = ["derive"] } final = "0.1.1" diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 1b677192d..b57556963 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -67,6 +67,7 @@ pub fn kernel< _: rc::common::SharedHeapPerThreadShallowCopy>, q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, + dynamic: &mut rc::utils::shared::slice::ThreadBlockSharedSlice, ) { let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); @@ -82,6 +83,13 @@ pub fn kernel< unsafe { *shared3.as_mut_ptr() = 12; } + + let index = rc::device::thread::Thread::this().index(); + if index < dynamic.len() { + unsafe { + *dynamic.index_mut_unchecked(index) = Dummy(42); + } + } } #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 7fce0a925..1bb8a5577 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -35,16 +35,11 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); - let ffi_param_ptx_jit_wrap = func_inputs - .iter().enumerate() - .rev() - .fold(quote! { + let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( + quote! 
{ #func_ident(#(#func_params),*) - }, |inner, (i, syn::PatType { - pat, - ty, - .. - })| { + }, + |inner, (i, syn::PatType { pat, ty, .. })| { let specialised_ty = quote::quote_spanned! { ty.span()=> #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) }; @@ -53,18 +48,30 @@ pub(in super::super) fn quote_cuda_wrapper( // To allow some parameters to also inject PTX JIT load markers here, // we pass them the param index i quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device::<_, #i>( - #pat, |#pat| { #inner } - ) + unsafe { + < + #specialised_ty as #crate_path::common::CudaKernelParameter + >::with_ffi_as_device::<_, #i>( + #pat, |#pat| { #inner } + ) + } } - }); + }, + ); quote! { #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] + #[allow(unused_unsafe)] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + unsafe { + // Initialise the dynamically-sized thread-block shared memory + // and the thread-local offset pointer that points to it + #crate_path::utils::shared::slice::init(); + } + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } diff --git a/src/common.rs b/src/common.rs index 2e8d72a7c..7f8f6cfec 100644 --- a/src/common.rs +++ b/src/common.rs @@ -362,7 +362,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[doc(hidden)] #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O; @@ -461,7 +461,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -536,7 +536,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn 
with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -597,7 +597,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -691,7 +691,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -726,11 +726,20 @@ impl_atomic_interior_mutable! { AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) } -// TODO: update const type layout -// impl -// InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell {} -// impl sealed::Sealed for -// core::cell::SyncUnsafeCell {} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +{ +} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for core::cell::SyncUnsafeCell +{ +} pub struct SharedHeapPerThreadShallowCopy { never: !, @@ -796,7 +805,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -853,7 +862,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -910,7 
+919,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -972,7 +981,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -997,9 +1006,9 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( - "// //", - in(reg32) *(core::ptr::from_ref(param).cast::()), - const(INDEX), + "// //", + param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), + param_index = const(INDEX), ); } } @@ -1017,6 +1026,17 @@ mod private_shared { // Safety: there is nothing to copy, this is just a zero-sized marker type unsafe impl DeviceCopy for ThreadBlockSharedFfi {} + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } + + // Safety: we only copy a usize, which implements `DeviceCopy` + unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} } impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter @@ -1057,7 +1077,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter #[cfg(all(not(feature = "host"), target_os = "cuda"))] #[allow(clippy::inline_always)] #[inline(always)] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( _param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -1070,3 +1090,61 @@ impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed for &'a mut crate::utils::shared::r#static::ThreadBlockShared { } + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut 
crate::utils::shared::slice::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + } + } + + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { + crate::utils::shared::slice::ThreadBlockSharedSlice::with_uninit_for_len( + param.len, inner, + ) + } + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice +{ +} diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 920f0c58d..7039e15f9 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -88,3 +88,60 @@ impl ThreadBlockSharedSlice { self.shared.get_unchecked_mut(index) } } + +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[doc(cfg(all(not(feature = "host"), target_os = 
"cuda")))] +impl ThreadBlockSharedSlice { + /// # Safety + /// + /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one + /// call to [`init`]. + pub(crate) unsafe fn with_uninit_for_len Q, Q>( + len: usize, + inner: F, + ) -> Q { + let base: *mut u8; + + unsafe { + core::arch::asm!( + "mov.u64 {base}, %rust_cuda_dynamic_shared;", + base = out(reg64) base, + ); + } + + let aligned_base = base.byte_add(base.align_offset(core::mem::align_of::())); + + let data: *mut T = aligned_base.cast(); + + let new_base = data.add(len).cast::(); + + unsafe { + core::arch::asm!( + "mov.u64 %rust_cuda_dynamic_shared, {new_base};", + new_base = in(reg64) new_base, + ); + } + + let shared = core::ptr::slice_from_raw_parts_mut(data, len); + + inner(&mut Self { shared }) + } +} + +#[doc(hidden)] +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +/// # Safety +/// +/// The thread-block shared dynamic memory must be initialised once and +/// only once per kernel. +pub unsafe fn init() { + unsafe { + core::arch::asm!(".reg .u64 %rust_cuda_dynamic_shared;"); + core::arch::asm!( + "cvta.shared.u64 %rust_cuda_dynamic_shared, rust_cuda_dynamic_shared_base;", + ); + } +} + +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); From cd8f4b4da519ddaebfa736673632735edf59e434 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 09:52:56 +0000 Subject: [PATCH 062/120] Begin refactoring the public API with device feature --- .github/workflows/ci.yml | 2 + Cargo.toml | 1 + examples/print/Cargo.toml | 2 +- examples/print/src/main.rs | 29 +-- examples/single-source/Cargo.toml | 2 +- examples/single-source/src/main.rs | 14 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 18 +- src/common.rs | 79 ++++---- src/deps.rs | 6 + src/device/alloc.rs | 2 +- src/device/mod.rs | 1 - src/device/thread.rs | 2 +- src/host/mod.rs | 3 - 
src/host/ptx_jit/mod.rs | 2 - src/lib.rs | 49 ++--- src/utils/aliasing/const.rs | 47 ++--- src/utils/aliasing/dynamic.rs | 50 ++--- src/utils/aliasing/final.rs | 7 +- src/utils/box.rs | 20 +- src/utils/boxed_slice.rs | 20 +- src/utils/device_copy.rs | 11 +- src/utils/exchange/buffer/common.rs | 12 +- src/utils/exchange/buffer/device.rs | 40 +--- src/utils/exchange/buffer/host.rs | 42 ++--- src/utils/exchange/buffer/mod.rs | 178 ++++++++++++++---- src/utils/exchange/mod.rs | 1 - src/utils/option.rs | 16 +- src/utils/shared/slice.rs | 48 ++--- src/utils/shared/static.rs | 18 +- 30 files changed, 386 insertions(+), 340 deletions(-) create mode 100644 src/deps.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07fa4ab26..a8f37a6dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,6 +57,7 @@ jobs: - name: Check feature powerset on the CPU run: | cargo hack check --feature-powerset --optional-deps \ + --skip device \ --keep-going - name: Check feature powerset on CUDA @@ -174,6 +175,7 @@ jobs: - name: Check feature powerset on the CPU run: | cargo hack clippy --feature-powerset --optional-deps \ + --skip device \ --keep-going \ -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index a218c629d..12a90ef59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ rust-version = "1.75" # nightly [features] default = [] host = ["dep:rustacuda", "dep:regex"] +device = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] [dependencies] diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml index 21f513d8f..05f3a537e 100644 --- a/examples/print/Cargo.toml +++ b/examples/print/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive"] } +rust-cuda = { path = "../../", features = ["derive", "device"] } [target.'cfg(not(target_os 
= "cuda"))'.dependencies] rust-cuda = { path = "../../", features = ["derive", "host"] } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 62a0e2713..462603ca6 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -12,8 +12,8 @@ extern crate alloc; -#[derive(rust_cuda::const_type_layout::TypeLayout)] -#[layout(crate = "rust_cuda::const_type_layout")] +#[derive(rust_cuda::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rust_cuda::deps::const_type_layout")] #[repr(C)] pub enum Action { Print, @@ -34,37 +34,38 @@ pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { } #[cfg(not(target_os = "cuda"))] -fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { +fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { // Link the non-generic CUDA kernel struct KernelPtx; link! { impl kernel for KernelPtx } // Initialize the CUDA API - rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device - let device = rust_cuda::rustacuda::device::Device::get_device(0)?; + let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?; // Create a CUDA context associated to this device let _context = rust_cuda::host::CudaDropWrapper::from( - rust_cuda::rustacuda::context::Context::create_and_push( - rust_cuda::rustacuda::context::ContextFlags::MAP_HOST - | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, + rust_cuda::deps::rustacuda::context::Context::create_and_push( + rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST + | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO, device, )?, ); // Create a new CUDA stream to submit kernels to - let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( - rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, - None, - )?); + let stream = + 
rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new( + rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING, + None, + )?); // Create a new instance of the CUDA kernel and prepare the launch config let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); let config = rust_cuda::host::LaunchConfig { - grid: rust_cuda::rustacuda::function::GridSize::x(1), - block: rust_cuda::rustacuda::function::BlockSize::x(4), + grid: rust_cuda::deps::rustacuda::function::GridSize::x(1), + block: rust_cuda::deps::rustacuda::function::BlockSize::x(4), shared_memory_size: 0, ptx_jit: false, }; diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index eeada181d..6f53359cd 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b57556963..41df1705d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -19,8 +19,8 @@ extern crate alloc; fn main() {} #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] +#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); #[derive(rc::common::LendRustToCuda)] @@ -36,13 +36,13 @@ pub struct Wrapper { pub struct Empty([u8; 0]); #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] 
+#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Tuple(u32, i32); #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] +#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); #[rc::common::kernel(pub use link! for impl)] @@ -94,8 +94,6 @@ pub fn kernel< #[cfg(not(target_os = "cuda"))] mod host { - // use super::{link, kernel}; - // Link several instances of the generic CUDA kernel struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 1bb8a5577..e87bd0d16 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -78,8 +78,8 @@ pub(in super::super) fn quote_cuda_wrapper( #( #[no_mangle] static #func_layout_params: [ - u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ffi_types>() - ] = #crate_path::const_type_layout::serialise_type_graph::<#ffi_types>(); + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_types>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_types>(); unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 896e51e89..2928cebef 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -29,20 +29,20 @@ pub fn cuda_struct_declaration( quote!(#where_clause #struct_fields_cuda) }; - let const_type_layout_crate_path = quote! { #crate_path::const_type_layout }.to_string(); + let const_type_layout_crate_path = quote! 
{ #crate_path::deps::const_type_layout }.to_string(); quote! { #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(#crate_path::const_type_layout::TypeLayout)] + #[derive(#crate_path::deps::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics #crate_path::rustacuda_core::DeviceCopy + unsafe impl #impl_generics #crate_path::deps::rustacuda_core::DeviceCopy for #struct_name_cuda #ty_generics #where_clause {} } } @@ -87,7 +87,7 @@ pub fn rust_to_cuda_trait( unsafe fn borrow( &self, alloc: CudaAllocType, - ) -> #crate_path::rustacuda::error::CudaResult<( + ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { @@ -110,7 +110,7 @@ pub fn rust_to_cuda_trait( alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - ) -> #crate_path::rustacuda::error::CudaResult { + ) -> #crate_path::deps::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_destructors)* @@ -156,8 +156,8 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async( &self, alloc: CudaAllocType, - stream: &#crate_path::rustacuda::stream::Stream, - ) -> #crate_path::rustacuda::error::CudaResult<( + stream: &#crate_path::deps::rustacuda::stream::Stream, + ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { @@ -180,8 +180,8 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - stream: &#crate_path::rustacuda::stream::Stream, - ) -> #crate_path::rustacuda::error::CudaResult { + stream: &#crate_path::deps::rustacuda::stream::Stream, + ) -> 
#crate_path::deps::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* diff --git a/src/common.rs b/src/common.rs index 7f8f6cfec..d9d1a955a 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,4 +1,4 @@ -#[cfg(any(not(feature = "host"), doc))] +#[cfg(feature = "device")] use core::convert::{AsMut, AsRef}; use core::{ marker::PhantomData, @@ -13,22 +13,20 @@ use core::{ ptr::{copy_nonoverlapping, NonNull}, }; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::LendRustToCuda; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::kernel; #[cfg(feature = "host")] use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; #[repr(transparent)] -#[cfg_attr(not(feature = "host"), derive(Debug))] +#[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] pub struct DeviceAccessible(T); @@ -54,7 +52,7 @@ impl From<&T> for DeviceAccessible fmt::Debug for DeviceAccessible { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) @@ -62,7 +60,7 @@ impl fmt::Debug for DeviceAccessible { } } -#[cfg(not(feature = "host"))] +#[cfg(feature = "device")] impl Deref for DeviceAccessible { type Target = T; @@ -71,7 +69,7 @@ impl Deref for DeviceAccessible { } } -#[cfg(not(feature = "host"))] +#[cfg(feature = "device")] impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 @@ -178,7 +176,7 @@ pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { type RustRepresentation: RustToCuda; #[doc(hidden)] - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] /// # Safety /// /// This is an internal function and should NEVER be called manually @@ -209,8 +207,7 @@ pub struct DeviceConstRef<'r, T: 
DeviceCopy + 'r> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } @@ -227,16 +224,14 @@ pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } } -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } @@ -254,16 +249,14 @@ pub struct DeviceOwnedRef<'r, T: DeviceCopy> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } } -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } @@ -336,7 +329,7 @@ pub trait CudaKernelParameter: sealed::Sealed { type AsyncHostType<'stream, 'b>; #[doc(hidden)] type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b>; #[cfg(feature = "host")] @@ -361,7 +354,7 @@ pub trait CudaKernelParameter: sealed::Sealed { ) -> Self::FfiType<'stream, 'b>; #[doc(hidden)] - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = 
"device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -428,7 +421,7 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] @@ -460,7 +453,7 @@ impl< param } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -492,7 +485,7 @@ impl< 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -535,7 +528,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -565,7 +558,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; @@ -596,7 +589,7 @@ impl< <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: 
Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -639,7 +632,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -690,7 +683,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -773,7 +766,7 @@ impl< 'b, DeviceAccessible<::CudaRepresentation>, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -804,7 +797,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -830,7 +823,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -861,7 +854,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: 
Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -884,7 +877,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -918,7 +911,7 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -945,7 +938,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -980,7 +973,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1002,7 +995,7 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( @@ -1014,7 +1007,7 @@ fn emit_param_ptx_jit_marker(param: &T) { } mod private_shared { - use 
const_type_layout::TypeGraphLayout; + use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; #[doc(hidden)] @@ -1044,7 +1037,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; #[cfg(feature = "host")] @@ -1074,7 +1067,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter private_shared::ThreadBlockSharedFfi { _marker: [] } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] #[allow(clippy::inline_always)] #[inline(always)] unsafe fn with_ffi_as_device( @@ -1097,7 +1090,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; #[cfg(feature = "host")] @@ -1130,7 +1123,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] #[allow(clippy::inline_always)] #[inline(always)] unsafe fn with_ffi_as_device( diff --git a/src/deps.rs b/src/deps.rs new file mode 100644 index 000000000..fe001e054 --- /dev/null +++ b/src/deps.rs @@ -0,0 +1,6 @@ +pub extern crate const_type_layout; + +#[cfg(feature = "host")] +pub extern crate rustacuda; + +pub extern crate rustacuda_core; diff --git a/src/device/alloc.rs b/src/device/alloc.rs index 0217fa939..c1c28f931 100644 --- 
a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -1,5 +1,5 @@ use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] +#[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free diff --git a/src/device/mod.rs b/src/device/mod.rs index c4a459087..07894b5bb 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,7 +1,6 @@ use core::mem::ManuallyDrop; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ diff --git a/src/device/thread.rs b/src/device/thread.rs index b2f3035bd..bb5599cda 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -1,4 +1,4 @@ -#[cfg(target_os = "cuda")] +#[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; #[allow(clippy::module_name_repetitions)] diff --git a/src/host/mod.rs b/src/host/mod.rs index 424b5726c..cc7fa681f 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -18,7 +18,6 @@ use rustacuda::{ use rustacuda_core::{DeviceCopy, DevicePointer}; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; use crate::{ @@ -162,8 +161,6 @@ pub struct LaunchConfig { pub ptx_jit: bool, } -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] pub struct RawPtxKernel { module: ManuallyDrop>, function: ManuallyDrop>, diff --git a/src/host/ptx_jit/mod.rs b/src/host/ptx_jit/mod.rs index 156e8223c..43c555ab2 100644 --- a/src/host/ptx_jit/mod.rs +++ b/src/host/ptx_jit/mod.rs @@ -6,7 +6,6 @@ mod replace; type ByteSliceOptionalArguments = Option>]>>; -#[doc(cfg(feature = "host"))] #[allow(clippy::module_name_repetitions)] pub struct PtxJITCompiler { ptx_slices: Box<[PtxElement]>, @@ -14,7 +13,6 @@ pub struct PtxJITCompiler { last_ptx: CString, } -#[doc(cfg(feature = "host"))] pub enum PtxJITResult<'s> { Cached(&'s CStr), Recomputed(&'s CStr), diff --git 
a/src/lib.rs b/src/lib.rs index 26f56adb6..118a55343 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,22 +6,14 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] -#![cfg_attr(not(feature = "host"), no_std)] +#![cfg_attr(all(feature = "device", not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] #![feature(negative_impls)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(stdsimd) -)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(asm_experimental_arch) -)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(asm_const) -)] +#![cfg_attr(feature = "device", feature(stdsimd))] +#![cfg_attr(feature = "device", feature(asm_experimental_arch))] +#![cfg_attr(feature = "device", feature(asm_const))] +#![feature(doc_auto_cfg)] #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] @@ -35,43 +27,36 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] -#![feature(tuple_trait)] -#![feature(unboxed_closures)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] -#![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] +#![cfg_attr(feature = "device", feature(slice_ptr_get))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] -#[doc(hidden)] -pub extern crate alloc; +#[cfg(all(feature = "host", feature = "device", not(doc)))] +core::compile_error!("cannot enable the `host` and `device` features at the same time"); -pub extern crate rustacuda_core; +#[cfg(all(feature = "host", target_os = "cuda", not(doc)))] +core::compile_error!("cannot enable the `host` feature on a target with `target_os=\"cuda\"`"); -#[doc(hidden)] -#[macro_use] -pub extern crate const_type_layout; +#[cfg(all(feature = 
"device", not(target_os = "cuda"), not(doc)))] +core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub extern crate rustacuda_derive; +#[doc(hidden)] +pub extern crate alloc; pub mod common; #[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] pub mod host; -#[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] -pub extern crate rustacuda; - -#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +#[cfg(feature = "device")] pub mod device; pub mod utils; pub mod safety; + +pub mod deps; diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 759b14dc9..131a05803 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -1,9 +1,11 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; +use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -13,6 +15,7 @@ use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { + #[cfg(feature = "host")] #[must_use] pub const fn new(inner: T) -> Self { Self(inner) @@ -26,7 +29,7 @@ unsafe impl DeviceCopy { } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); @@ -34,7 +37,7 @@ fn split_slice_const_stride(slice: &[E]) -> &[E] { unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn 
split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); @@ -42,7 +45,7 @@ fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] impl SplitSliceOverCudaThreadsConstStride { /// # Safety /// @@ -63,7 +66,8 @@ impl SplitSliceOverCudaThreadsConstStride { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -74,7 +78,8 @@ impl, const STRIDE: usize> Deref } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -83,7 +88,8 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -92,7 +98,8 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -101,7 +108,8 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -110,7 +118,8 @@ impl, const STRIDE: 
usize> Borrow<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -119,7 +128,7 @@ impl, const STRIDE: usize> BorrowMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -130,7 +139,7 @@ impl, const STRIDE: usize> Deref } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -139,7 +148,7 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -148,7 +157,7 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -157,7 +166,7 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -166,7 +175,7 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -183,7 +192,6 @@ unsafe impl RustToCuda SplitSliceOverCudaThreadsConstStride, STRIDE>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -201,7 +209,6 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - #[doc(cfg(feature = 
"host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -214,7 +221,6 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsConstStride { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -233,7 +239,6 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -248,8 +253,8 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) + SplitSliceOverCudaThreadsConstStride(CudaAsRust::as_rust(&this.0)) } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index a3cecfa8f..a6577fc6f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -1,9 +1,11 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; +use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -16,6 +18,7 @@ pub struct SplitSliceOverCudaThreadsDynamicStride { } impl SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] #[must_use] pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } @@ -26,7 +29,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { // [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = 
slice.len().min(offset + stride).saturating_sub(offset); @@ -34,7 +37,7 @@ fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); @@ -42,7 +45,7 @@ fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] impl SplitSliceOverCudaThreadsDynamicStride { /// # Safety /// @@ -63,7 +66,8 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -72,42 +76,47 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { split_slice_dynamic_stride_mut(&mut self.inner, self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { split_slice_dynamic_stride(self.inner.as_ref(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] 
impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.as_mut(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { split_slice_dynamic_stride(self.inner.borrow(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.borrow_mut(), self.stride) } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -116,35 +125,35 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { self.inner.as_ref() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { self.inner.as_mut() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { self.inner.borrow() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = 
"host", not(doc)))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { self.inner.borrow_mut() @@ -157,7 +166,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride SplitSliceOverCudaThreadsDynamicStride>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -178,7 +186,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -189,7 +196,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -211,7 +217,6 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -226,8 +231,11 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) + SplitSliceOverCudaThreadsDynamicStride { + stride: this.stride, + inner: CudaAsRust::as_rust(&this.inner), + } } } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 366de9557..230ea4e8a 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,3 +1,4 @@ +use const_type_layout::TypeLayout; use r#final::Final; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -16,7 +17,6 @@ unsafe impl RustToCuda for Final { type CudaRepresentation = FinalCudaRepresentation; 
#[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -34,7 +34,6 @@ unsafe impl RustToCuda for Final { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -48,7 +47,6 @@ unsafe impl RustToCuda for Final { unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -67,7 +65,6 @@ unsafe impl RustToCudaAsync for Final { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -83,7 +80,7 @@ unsafe impl RustToCudaAsync for Final { unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { Final::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/box.rs b/src/utils/box.rs index ab0b22708..9972c4ef3 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -1,18 +1,22 @@ use alloc::boxed::Box; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::{ common::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -31,14 +35,13 @@ unsafe impl rustacuda_core::DeviceCopy } unsafe impl RustToCuda for Box { - #[cfg(feature = "host")] + #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = 
crate::host::CudaDropWrapper>>; - #[cfg(not(feature = "host"))] + #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -59,7 +62,6 @@ unsafe impl RustToCuda for Box { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -79,7 +81,7 @@ unsafe impl RustToCuda for Box { unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(this.0) } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 588fa8c07..bd9e74aee 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -1,18 +1,22 @@ use alloc::boxed::Box; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::{ common::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -31,14 +35,13 @@ unsafe impl rustacuda_core::DeviceCopy } unsafe impl RustToCuda for Box<[T]> { - #[cfg(feature = "host")] + #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; - #[cfg(not(feature = "host"))] + #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = 
BoxedSliceCudaRepresentation; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -61,7 +64,6 @@ unsafe impl RustToCuda for Box<[T]> { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -81,7 +83,7 @@ unsafe impl RustToCuda for Box<[T]> { unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { type RustRepresentation = Box<[T]>; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 0a92e69a1..1f03c1799 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -1,12 +1,15 @@ #![allow(clippy::trait_duplication_in_bounds)] -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ - common::{CudaAsRust, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync}, + common::{CudaAsRust, NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::common::{CombinedCudaAlloc, CudaAlloc}; @@ -100,7 +103,6 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -127,7 +129,6 @@ unsafe impl RustToCudaAsync for SafeDeviceC } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: CombinedCudaAlloc, @@ -142,7 +143,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> 
Self::RustRepresentation { let mut uninit = core::mem::MaybeUninit::uninit(); core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 12a491b20..2725811ca 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,4 +1,4 @@ -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; use crate::{common::CudaAsRust, safety::SafeDeviceCopy}; @@ -28,10 +28,12 @@ unsafe impl; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { - CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw( - core::slice::from_raw_parts_mut(this.0, this.1), - ))) + CudaExchangeBuffer { + inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( + alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)), + )), + } } } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index f6f00248b..ed160e185 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,26 +2,18 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{ - common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}, - safety::SafeDeviceCopy, -}; +use crate::safety::SafeDeviceCopy; -use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; +use super::CudaExchangeItem; #[allow(clippy::module_name_repetitions)] -#[doc(cfg(not(feature = "host")))] -/// When the `host` feature is set, -/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer) -/// refers to -/// [`CudaExchangeBufferHost`](super::CudaExchangeBufferHost) -/// instead. -/// [`CudaExchangeBufferDevice`](Self) is never exposed directly. 
-pub struct CudaExchangeBufferDevice( - pub(super) core::mem::ManuallyDrop]>>, -); +pub struct CudaExchangeBufferDevice< + T: SafeDeviceCopy + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -31,24 +23,10 @@ impl Deref } } -impl DerefMut +impl DerefMut for CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } - -#[cfg(not(all(doc, feature = "host")))] -unsafe impl RustToCuda - for CudaExchangeBufferDevice -{ - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = CudaExchangeBufferCudaRepresentation; -} - -#[cfg(not(all(doc, feature = "host")))] -unsafe impl RustToCudaAsync - for CudaExchangeBufferDevice -{ -} diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 24a95bfe3..56fc259cd 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,9 +11,7 @@ use rustacuda::{ }; use crate::{ - common::{ - CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync, - }, + common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc}, host::CudaDropWrapper, safety::SafeDeviceCopy, }; @@ -21,13 +19,6 @@ use crate::{ use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] -#[doc(cfg(feature = "host"))] -/// When the `host` feature is **not** set, -/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer) -/// refers to -/// [`CudaExchangeBufferDevice`](super::CudaExchangeBufferDevice) -/// instead. -/// [`CudaExchangeBufferHost`](Self) is never exposed directly. 
pub struct CudaExchangeBufferHost< T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, @@ -104,19 +95,16 @@ impl Dere } } -unsafe impl RustToCuda - for CudaExchangeBufferHost +impl + CudaExchangeBufferHost { - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = CudaExchangeBufferCudaRepresentation; - #[allow(clippy::type_complexity)] - unsafe fn borrow( + pub unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + DeviceAccessible>, + CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda @@ -141,9 +129,9 @@ unsafe impl( + pub unsafe fn restore( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { let (_alloc_front, alloc_tail) = alloc.split(); @@ -160,17 +148,17 @@ unsafe impl RustToCudaAsync - for CudaExchangeBufferHost +impl + CudaExchangeBufferHost { #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + pub unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + DeviceAccessible>, + CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda @@ -196,9 +184,9 @@ unsafe impl( + pub unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { let (_alloc_front, alloc_tail) = alloc.split(); diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 66b2144c1..dcbbc036f 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -1,22 +1,146 @@ -use core::mem::MaybeUninit; +#[cfg(any(feature = "host", feature = "device"))] +use core::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; +use const_type_layout::TypeLayout; + +#[cfg(any(feature = 
"host", feature = "device"))] +use const_type_layout::TypeGraphLayout; + +use crate::safety::SafeDeviceCopy; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}; + +#[cfg(feature = "host")] +use crate::common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible}; + +#[cfg(any(feature = "host", feature = "device"))] +use self::common::CudaExchangeBufferCudaRepresentation; + +#[cfg(any(feature = "host", feature = "device"))] mod common; -#[cfg(any(not(feature = "host"), doc))] +#[cfg(feature = "device")] mod device; #[cfg(feature = "host")] mod host; -#[cfg(not(feature = "host"))] +#[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] -pub use device::CudaExchangeBufferDevice as CudaExchangeBuffer; +pub struct CudaExchangeBuffer +{ + #[cfg(feature = "host")] + inner: host::CudaExchangeBufferHost, + #[cfg(all(feature = "device", not(feature = "host")))] + inner: device::CudaExchangeBufferDevice, +} + #[cfg(feature = "host")] -#[allow(clippy::module_name_repetitions)] -pub use host::CudaExchangeBufferHost as CudaExchangeBuffer; +impl + CudaExchangeBuffer +{ + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn new(elem: &T, capacity: usize) -> rustacuda::error::CudaResult { + Ok(Self { + inner: host::CudaExchangeBufferHost::new(elem, capacity)?, + }) + } +} -#[cfg(doc)] -pub use self::{device::CudaExchangeBufferDevice, host::CudaExchangeBufferHost}; +#[cfg(feature = "host")] +impl + CudaExchangeBuffer +{ + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn from_vec(vec: Vec) -> rustacuda::error::CudaResult { + Ok(Self { + inner: host::CudaExchangeBufferHost::from_vec(vec)?, + }) + } +} -use crate::safety::SafeDeviceCopy; +#[cfg(any(feature = "host", feature = "device"))] +impl Deref + for CudaExchangeBuffer +{ + type Target = [CudaExchangeItem]; + + fn 
deref(&self) -> &Self::Target { + &self.inner + } +} + +#[cfg(any(feature = "host", feature = "device"))] +impl DerefMut + for CudaExchangeBuffer +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl RustToCuda + for CudaExchangeBuffer +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = CudaExchangeBufferCudaRepresentation; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + self.inner.borrow(alloc) + } + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + self.inner.restore(alloc) + } +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl RustToCudaAsync + for CudaExchangeBuffer +{ + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + self.inner.borrow_async(alloc, stream) + } + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.inner.restore_async(alloc, stream) + } +} #[repr(transparent)] #[derive(Clone, Copy, TypeLayout)] @@ -30,28 +154,24 @@ unsafe impl rustacuda_core: } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub fn write(&mut self, value: T) { self.0 = value; } } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), 
doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn read(&self) -> &T { &self.0 } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn write(&mut self, value: T) { self.0 = value; } @@ -64,36 +184,31 @@ impl AsMut for CudaExchangeItem { } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -101,8 +216,7 @@ impl CudaExchangeItem { unsafe { &*(self as *const Self).cast() } } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -112,8 +226,7 @@ impl CudaExchangeItem { } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -121,8 +234,7 @@ impl CudaExchangeItem { unsafe { &*(self as *const Self).cast() } } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = 
"device")] pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index ffca4bbf3..722e02559 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,5 +1,4 @@ pub mod buffer; #[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] pub mod wrapper; diff --git a/src/utils/option.rs b/src/utils/option.rs index a7b3e991e..dec109f38 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -1,6 +1,9 @@ use core::mem::MaybeUninit; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::error::CudaResult; use crate::{ common::{ @@ -12,10 +15,7 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, - rustacuda::error::CudaResult, -}; +use crate::common::{CombinedCudaAlloc, CudaAlloc}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -35,7 +35,6 @@ unsafe impl RustToCuda for Option { type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -71,7 +70,6 @@ unsafe impl RustToCuda for Option { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -89,7 +87,6 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -126,7 +123,6 @@ unsafe impl RustToCudaAsync for Option { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: CombinedCudaAlloc, @@ -146,7 +142,7 @@ unsafe impl RustToCudaAsync for Option { unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = 
Option<::RustRepresentation>; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { if this.present { Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 7039e15f9..bec725bd1 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -3,56 +3,40 @@ use const_type_layout::TypeGraphLayout; #[allow(clippy::module_name_repetitions)] #[repr(transparent)] pub struct ThreadBlockSharedSlice { - #[cfg(not(target_os = "cuda"))] - // dangling marker s.t. Self is not StackOnly - dangling: *mut [T], - #[cfg(target_os = "cuda")] shared: *mut [T], } impl ThreadBlockSharedSlice { - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn new_uninit_with_len(len: usize) -> Self { Self { - dangling: Self::dangling_slice_with_len(len), + shared: Self::dangling_slice_with_len(len), } } - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn with_len(mut self, len: usize) -> Self { - self.dangling = Self::dangling_slice_with_len(len); + self.shared = Self::dangling_slice_with_len(len); self } - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn with_len_mut(&mut self, len: usize) -> &mut Self { - self.dangling = Self::dangling_slice_with_len(len); + self.shared = Self::dangling_slice_with_len(len); self } - #[cfg(not(target_os = "cuda"))] + #[cfg(feature = "host")] fn dangling_slice_with_len(len: usize) -> *mut [T] { core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len) } #[must_use] pub fn len(&self) -> usize { - core::ptr::metadata({ - #[cfg(not(target_os = "cuda"))] - { - self.dangling - } - #[cfg(target_os = "cuda")] - { - self.shared - } - }) + 
core::ptr::metadata(self.shared) } #[must_use] @@ -60,22 +44,19 @@ impl ThreadBlockSharedSlice { self.len() == 0 } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { self.shared.cast() } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_slice_ptr(&self) -> *mut [T] { self.shared } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] /// # Safety /// /// The provided `index` must not be out of bounds. @@ -89,8 +70,7 @@ impl ThreadBlockSharedSlice { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +#[cfg(feature = "device")] impl ThreadBlockSharedSlice { /// # Safety /// @@ -129,7 +109,7 @@ impl ThreadBlockSharedSlice { } #[doc(hidden)] -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] /// # Safety /// /// The thread-block shared dynamic memory must be initialised once and @@ -143,5 +123,5 @@ pub unsafe fn init() { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 41ba334ba..62c3a0c49 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,25 +1,23 @@ #[repr(transparent)] pub struct ThreadBlockShared { - #[cfg(not(target_os = "cuda"))] - // dangling marker s.t. 
Self is not StackOnly - _dangling: *mut T, - #[cfg(target_os = "cuda")] + #[cfg_attr(not(feature = "device"), allow(dead_code))] shared: *mut T, } impl ThreadBlockShared { + #[cfg(any(feature = "host", feature = "device"))] #[must_use] #[allow(clippy::inline_always, clippy::missing_const_for_fn)] #[inline(always)] pub fn new_uninit() -> Self { - #[cfg(not(target_os = "cuda"))] + #[cfg(feature = "host")] { Self { - _dangling: core::ptr::NonNull::dangling().as_ptr(), + shared: core::ptr::NonNull::dangling().as_ptr(), } } - #[cfg(target_os = "cuda")] + #[cfg(feature = "device")] { let shared: *mut T; @@ -37,8 +35,7 @@ impl ThreadBlockShared { } } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { self.shared @@ -46,8 +43,7 @@ impl ThreadBlockShared { } impl ThreadBlockShared<[T; N]> { - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] /// # Safety /// /// The provided `index` must not be out of bounds. 
From 28a1e266ddee49ff6cdb883ff5548ce4dd381838 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:08:39 +0000 Subject: [PATCH 063/120] Refactoring to prepare for better module structure --- src/common.rs | 4 ++-- src/deps.rs | 2 ++ src/device/alloc.rs | 3 ++- src/device/utils.rs | 8 +++++--- src/host/mod.rs | 2 +- src/host/ptx_jit/replace.rs | 3 +-- src/lib.rs | 5 +---- src/utils/box.rs | 4 ++-- src/utils/boxed_slice.rs | 4 ++-- src/utils/exchange/buffer/common.rs | 4 +++- src/utils/exchange/buffer/device.rs | 4 ++-- src/utils/exchange/buffer/host.rs | 3 +-- src/utils/exchange/wrapper.rs | 5 ++--- 13 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/common.rs b/src/common.rs index d9d1a955a..d7b9815b4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -5,13 +5,13 @@ use core::{ ops::{Deref, DerefMut}, }; -#[cfg(feature = "host")] -use alloc::fmt; #[cfg(feature = "host")] use core::{ mem::MaybeUninit, ptr::{copy_nonoverlapping, NonNull}, }; +#[cfg(feature = "host")] +use std::fmt; use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; diff --git a/src/deps.rs b/src/deps.rs index fe001e054..68257e095 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -1,3 +1,5 @@ +pub(crate) extern crate alloc; + pub extern crate const_type_layout; #[cfg(feature = "host")] diff --git a/src/device/alloc.rs b/src/device/alloc.rs index c1c28f931..bca59a1eb 100644 --- a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -1,7 +1,8 @@ -use alloc::alloc::{GlobalAlloc, Layout}; #[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; +use crate::deps::alloc::alloc::{GlobalAlloc, Layout}; + /// Memory allocator using CUDA malloc/free pub struct PTXAllocator; diff --git a/src/device/utils.rs b/src/device/utils.rs index 3b37307a6..cbc5080ab 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,3 +1,5 @@ +use crate::deps::alloc::{fmt, string::String}; + /// Abort the CUDA kernel using the `trap` system call. 
#[allow(clippy::inline_always)] #[inline(always)] @@ -52,7 +54,7 @@ pub fn print(args: ::core::fmt::Arguments) { let msg = if let Some(msg) = args.as_str() { msg } else { - msg = ::alloc::fmt::format(args); + msg = fmt::format(args); msg.as_str() }; @@ -93,7 +95,7 @@ pub fn pretty_panic_handler( if let Some(msg) = message.as_str() { msg } else if allow_dynamic_message { - msg = ::alloc::fmt::format(*message); + msg = fmt::format(*message); msg.as_str() } else { "" @@ -102,7 +104,7 @@ pub fn pretty_panic_handler( && allow_dynamic_payload { msg - } else if let Some(msg) = info.payload().downcast_ref::<::alloc::string::String>() + } else if let Some(msg) = info.payload().downcast_ref::() && allow_dynamic_payload { msg.as_str() diff --git a/src/host/mod.rs b/src/host/mod.rs index cc7fa681f..45dc6f059 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -1,9 +1,9 @@ -use core::ptr::NonNull; use std::{ ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, + ptr::NonNull, }; use rustacuda::{ diff --git a/src/host/ptx_jit/replace.rs b/src/host/ptx_jit/replace.rs index ed59701c7..97a592da9 100644 --- a/src/host/ptx_jit/replace.rs +++ b/src/host/ptx_jit/replace.rs @@ -1,5 +1,4 @@ -use core::ptr::NonNull; -use std::{ffi::CString, ops::Deref}; +use std::{ffi::CString, ops::Deref, ptr::NonNull}; use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; diff --git a/src/lib.rs b/src/lib.rs index 118a55343..16d48d0b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] -#![cfg_attr(all(feature = "device", not(doc)), no_std)] +#![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] #![feature(negative_impls)] @@ -44,9 +44,6 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ #[cfg(all(feature = "device", not(target_os = "cuda"), 
not(doc)))] core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -#[doc(hidden)] -pub extern crate alloc; - pub mod common; #[cfg(feature = "host")] diff --git a/src/utils/box.rs b/src/utils/box.rs index 9972c4ef3..8672c36a0 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -1,4 +1,4 @@ -use alloc::boxed::Box; +use crate::deps::alloc::boxed::Box; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -83,6 +83,6 @@ unsafe impl CudaAsRust for BoxCudaRepresent #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(this.0) + crate::deps::alloc::boxed::Box::from_raw(this.0) } } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index bd9e74aee..e9113d865 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -1,4 +1,4 @@ -use alloc::boxed::Box; +use crate::deps::alloc::boxed::Box; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -85,6 +85,6 @@ unsafe impl CudaAsRust for BoxedSliceCudaRe #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } } diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 2725811ca..31f50cb68 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -32,7 +32,9 @@ unsafe impl) -> Self::RustRepresentation { CudaExchangeBuffer { inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)), + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( + this.0, this.1, + )), )), } } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 
ed160e185..139224da3 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,7 +2,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::safety::SafeDeviceCopy; +use crate::{deps::alloc::boxed::Box, safety::SafeDeviceCopy}; use super::CudaExchangeItem; @@ -11,7 +11,7 @@ pub struct CudaExchangeBufferDevice< T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool, ->(pub(super) core::mem::ManuallyDrop]>>); +>(pub(super) core::mem::ManuallyDrop]>>); impl Deref for CudaExchangeBufferDevice diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 56fc259cd..9bbf8a0af 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,4 @@ -use alloc::vec::Vec; -use core::{ +use std::{ cell::UnsafeCell, ops::{Deref, DerefMut}, }; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 4edfdebd8..5f64d3d05 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,12 +1,11 @@ -use core::{ +use std::{ future::{Future, IntoFuture}, marker::PhantomData, ops::{Deref, DerefMut}, + sync::{Arc, Mutex}, task::{Poll, Waker}, }; -use std::sync::Mutex; -use alloc::sync::Arc; use rustacuda::{ error::{CudaError, CudaResult}, event::{Event, EventFlags, EventStatus}, From cdd84a4d6b671a700b65da7c80ff85b8fcb2e937 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:31:51 +0000 Subject: [PATCH 064/120] Extract kernel module just for parameters --- examples/print/src/main.rs | 4 +- examples/single-source/src/main.rs | 20 +- .../wrapper/generate/cuda_generic_function.rs | 2 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 6 +- .../generate/host_linker_macro/get_ptx.rs | 2 +- src/common.rs | 838 +----------------- src/host/mod.rs | 7 +- src/kernel.rs | 838 ++++++++++++++++++ src/lib.rs | 10 +- src/safety/no_aliasing.rs | 8 +- src/utils/shared/mod.rs | 13 +- 
src/utils/shared/slice.rs | 1 - 12 files changed, 881 insertions(+), 868 deletions(-) create mode 100644 src/kernel.rs diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 462603ca6..7a26ce2bd 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -21,9 +21,9 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(use link! for impl)] +#[rust_cuda::kernel::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { +pub fn kernel(action: rust_cuda::kernel::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 41df1705d..383ade30a 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -45,7 +45,7 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(pub use link! for impl)] +#[rc::kernel::kernel(pub use link! 
for impl)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), @@ -61,16 +61,16 @@ pub fn kernel< + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - _x: &rc::common::PerThreadShallowCopy, - _z: &rc::common::SharedHeapPerThreadShallowCopy>, - _v @ _w: &'a rc::common::ShallowInteriorMutable, - _: rc::common::SharedHeapPerThreadShallowCopy>, - q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, - shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, - dynamic: &mut rc::utils::shared::slice::ThreadBlockSharedSlice, + _x: &rc::kernel::PerThreadShallowCopy, + _z: &rc::kernel::SharedHeapPerThreadShallowCopy>, + _v @ _w: &'a rc::kernel::ShallowInteriorMutable, + _: rc::kernel::SharedHeapPerThreadShallowCopy>, + q @ Triple(s, mut __t, _u): rc::kernel::PerThreadShallowCopy, + shared3: &mut rc::utils::shared::ThreadBlockShared, + dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, ) { - let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); - let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared2 = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index 8a5de226e..1b05df23b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -26,7 +26,7 @@ pub(in super::super) fn quote_cuda_generic_function( colon_token, }| { let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<'_> }; syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index e87bd0d16..f61bb9b32 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -50,7 +50,7 @@ pub(in super::super) fn quote_cuda_wrapper( quote::quote_spanned! { ty.span()=> unsafe { < - #specialised_ty as #crate_path::common::CudaKernelParameter + #specialised_ty as #crate_path::kernel::CudaKernelParameter >::with_ffi_as_device::<_, #i>( #pat, |#pat| { #inner } ) @@ -69,7 +69,7 @@ pub(in super::super) fn quote_cuda_wrapper( unsafe { // Initialise the dynamically-sized thread-block shared memory // and the thread-local offset pointer that points to it - #crate_path::utils::shared::slice::init(); + #crate_path::utils::shared::init(); } unsafe { @@ -122,7 +122,7 @@ fn specialise_ffi_input_types( }; let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> }; let ffi_param = syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 439f27f9e..d7394142e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -143,7 +143,7 @@ fn generate_lifetime_erased_types( } quote::quote_spanned! 
{ ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> } }).collect() } diff --git a/src/common.rs b/src/common.rs index d7b9815b4..37d005ac4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,15 +1,12 @@ +use core::marker::PhantomData; #[cfg(feature = "device")] -use core::convert::{AsMut, AsRef}; use core::{ - marker::PhantomData, + convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; #[cfg(feature = "host")] -use core::{ - mem::MaybeUninit, - ptr::{copy_nonoverlapping, NonNull}, -}; +use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; #[cfg(feature = "host")] use std::fmt; @@ -19,9 +16,6 @@ use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] pub use rust_cuda_derive::LendRustToCuda; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; - #[cfg(feature = "host")] use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; @@ -315,829 +309,3 @@ impl CombinedCudaAlloc { (self.0, self.1) } } - -mod sealed { - #[doc(hidden)] - pub trait Sealed {} -} - -// TODO: doc cfg -pub trait CudaKernelParameter: sealed::Sealed { - #[cfg(feature = "host")] - type SyncHostType; - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; - #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; - - #[cfg(feature = "host")] - #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn 
async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; - - #[doc(hidden)] - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; -} - -pub struct PtxJit { - never: !, - _marker: PhantomData, -} - -impl Deref for PtxJit { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl DerefMut for PtxJit { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, -> { - never: !, - _marker: PhantomData, -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > Deref for PerThreadShallowCopy -{ - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > DerefMut for PerThreadShallowCopy -{ - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for PerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( - 
param, - )) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - param - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.into_inner(); - - inner(param) - } -} -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for PerThreadShallowCopy -{ -} - -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - 
inner(const_ref.as_async()) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref().into_ref(); - - inner(param) - } -} -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PerThreadShallowCopy -{ -} - -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - 
inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PtxJit> -{ -} - -pub struct ShallowInteriorMutable { - never: !, - _marker: PhantomData, -} - -impl Deref for ShallowInteriorMutable { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter - for &'a ShallowInteriorMutable -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; - #[cfg(feature = "host")] - /// The kernel takes a mutable borrow of the interior mutable data to ensure - /// the interior mutability is limited to just this kernel invocation. 
- type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - let result = inner(const_ref.as_async()); - - host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( - param, - ))?; - - result - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref().into_ref(); - - inner(param) - } -} -impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} - -pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout - + sealed::Sealed -{ -} - -macro_rules! impl_atomic_interior_mutable { - ($atomic:ident($interior:ty)) => { - impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} - impl sealed::Sealed for core::sync::atomic::$atomic {} - }; - ($($atomic:ident($interior:ty)),*) => { - $(impl_atomic_interior_mutable! 
{ $atomic($interior) })* - } -} - -impl_atomic_interior_mutable! { - AtomicBool(bool), - AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), - AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -} - -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell -{ -} -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for core::cell::SyncUnsafeCell -{ -} - -pub struct SharedHeapPerThreadShallowCopy { - never: !, - _marker: PhantomData, -} - -impl Deref for SharedHeapPerThreadShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl DerefMut for SharedHeapPerThreadShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > CudaKernelParameter for SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = - DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } - } -} -impl< - T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > sealed::Sealed for SharedHeapPerThreadShallowCopy -{ -} - -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: 
Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a SharedHeapPerThreadShallowCopy -{ -} - -impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > CudaKernelParameter for PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - T: RustToCuda< - CudaRepresentation: 
crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > sealed::Sealed for PtxJit> -{ -} - -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - 
for &'a PtxJit> -{ -} - -#[cfg(feature = "host")] -fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { - NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) -} - -#[cfg(feature = "device")] -fn emit_param_ptx_jit_marker(param: &T) { - unsafe { - core::arch::asm!( - "// //", - param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), - param_index = const(INDEX), - ); - } -} - -mod private_shared { - use const_type_layout::{TypeGraphLayout, TypeLayout}; - use rustacuda_core::DeviceCopy; - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _marker: [T; 0], - } - - // Safety: there is nothing to copy, this is just a zero-sized marker type - unsafe impl DeviceCopy for ThreadBlockSharedFfi {} - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { - pub(super) len: usize, - pub(super) _marker: [T; 0], - } - - // Safety: we only copy a usize, which implements `DeviceCopy` - unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} -} - -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::r#static::ThreadBlockShared -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - 
inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - _param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { _marker: [] } - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - _param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let mut param = crate::utils::shared::r#static::ThreadBlockShared::new_uninit(); - - inner(&mut param) - } -} -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::r#static::ThreadBlockShared -{ -} - -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedSliceFfi { - len: param.len(), - _marker: [], - } - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - param: 
Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { - crate::utils::shared::slice::ThreadBlockSharedSlice::with_uninit_for_len( - param.len, inner, - ) - } - } -} -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice -{ -} diff --git a/src/host/mod.rs b/src/host/mod.rs index 45dc6f059..2d423362a 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -22,9 +22,10 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, - EmptyCudaAlloc, NoCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, + NoCudaAlloc, RustToCuda, }, + kernel::CudaKernelParameter, safety::{NoSafeAliasing, SafeDeviceCopy}, }; @@ -1154,7 +1155,7 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea /// [`CompiledKernelPtx::get_entry_point`]. /// /// This trait should not be implemented manually – use the -/// [`kernel`](crate::common::kernel) macro instead. +/// [`kernel`](crate::kernel::kernel) macro instead. 
pub unsafe trait CompiledKernelPtx { fn get_ptx() -> &'static CStr; fn get_entry_point() -> &'static CStr; diff --git a/src/kernel.rs b/src/kernel.rs new file mode 100644 index 000000000..98ae0220c --- /dev/null +++ b/src/kernel.rs @@ -0,0 +1,838 @@ +#[cfg(feature = "device")] +use core::convert::AsRef; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +#[cfg(feature = "host")] +use core::ptr::NonNull; + +use const_type_layout::TypeGraphLayout; + +#[cfg(feature = "derive")] +pub use rust_cuda_derive::kernel; + +use crate::common::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef, EmptyCudaAlloc, RustToCuda}; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + #[doc(hidden)] + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b>; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; +} + +pub struct PtxJit { + never: !, + _marker: PhantomData, +} + +impl Deref for PtxJit { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl 
DerefMut for PtxJit { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +pub struct PerThreadShallowCopy< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, +> { + never: !, + _marker: PhantomData, +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > Deref for PerThreadShallowCopy +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > DerefMut for PerThreadShallowCopy +{ + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T; + type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( + param, + )) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + param + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: 
impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.into_inner(); + + inner(param) + } +} +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy +{ +} + +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + inner(const_ref.as_async()) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.as_ref().into_ref(); + + inner(param) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy +{ +} + +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, 
PARAM>(param.as_ref()); + + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> +{ +} + +pub struct ShallowInteriorMutable { + never: !, + _marker: PhantomData, +} + +impl Deref for ShallowInteriorMutable { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter + for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + let result = inner(const_ref.as_async()); + + host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( + param, + ))?; + + result + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.as_ref().into_ref(); + + inner(param) + } +} +impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} + +pub trait InteriorMutableSafeDeviceCopy: + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout + + sealed::Sealed +{ +} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! 
{ $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! { + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +{ +} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for core::cell::SyncUnsafeCell +{ +} + +pub struct SharedHeapPerThreadShallowCopy { + never: !, + _marker: PhantomData, +} + +impl Deref for SharedHeapPerThreadShallowCopy { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl DerefMut for SharedHeapPerThreadShallowCopy { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T; + type FfiType<'stream, 'b> = + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for SharedHeapPerThreadShallowCopy +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a SharedHeapPerThreadShallowCopy +{ +} + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = + as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + T: RustToCuda< + CudaRepresentation: 
crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + 
for &'a PtxJit> +{ +} + +#[cfg(feature = "host")] +fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { + NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) +} + +#[cfg(feature = "device")] +fn emit_param_ptx_jit_marker(param: &T) { + unsafe { + core::arch::asm!( + "// //", + param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), + param_index = const(INDEX), + ); + } +} + +mod private_shared { + use const_type_layout::{TypeGraphLayout, TypeLayout}; + use rustacuda_core::DeviceCopy; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _marker: [T; 0], + } + + // Safety: there is nothing to copy, this is just a zero-sized marker type + unsafe impl DeviceCopy for ThreadBlockSharedFfi {} + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } + + // Safety: we only copy a usize, which implements `DeviceCopy` + unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockShared +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = 
"host")] + fn async_to_ffi<'stream, 'b>( + _param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedFfi { _marker: [] } + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + _param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + + inner(&mut param) + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockShared +{ +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + } + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, 
+ ) -> O { + unsafe { + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) + } + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ +} diff --git a/src/lib.rs b/src/lib.rs index 16d48d0b3..6ba80f56f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,15 +45,13 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); pub mod common; +pub mod deps; +pub mod kernel; +pub mod safety; +pub mod utils; #[cfg(feature = "host")] pub mod host; #[cfg(feature = "device")] pub mod device; - -pub mod utils; - -pub mod safety; - -pub mod deps; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 0fc3abf9c..f5c80e354 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -47,9 +47,9 @@ /// with mutable access to its own partition of a slice and thus avoid mutable /// aliasing. 
/// -/// * [`ThreadBlockShared`](crate::utils::shared::static::ThreadBlockShared) +/// [`ThreadBlockShared`](crate::utils::shared::ThreadBlockShared) /// and -/// [`ThreadBlockSharedSlice`](crate::utils::shared::slice::ThreadBlockSharedSlice) +/// [`ThreadBlockSharedSlice`](crate::utils::shared::ThreadBlockSharedSlice) /// also implement [`NoSafeAliasing`] since they only provide access to `*mut /// T`, which is always unsafe to mutate and thus moves the burden to uphold /// the no-mutable-aliasing safety invariant to the user who dereferences these @@ -79,8 +79,8 @@ unsafe impl NoSafeAliasing // Thread-block-shared data only allows unsafe aliasing since only raw pointers // are exposed -unsafe impl NoSafeAliasing for crate::utils::shared::r#static::ThreadBlockShared {} +unsafe impl NoSafeAliasing for crate::utils::shared::ThreadBlockShared {} unsafe impl NoSafeAliasing - for crate::utils::shared::slice::ThreadBlockSharedSlice + for crate::utils::shared::ThreadBlockSharedSlice { } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index 88a586ad6..dfd3f2019 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,2 +1,11 @@ -pub mod slice; -pub mod r#static; +mod slice; +mod r#static; + +pub use slice::ThreadBlockSharedSlice; + +#[allow(clippy::module_name_repetitions)] +pub use r#static::ThreadBlockShared; + +#[doc(hidden)] +#[cfg(feature = "device")] +pub use slice::init; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index bec725bd1..f60276e6b 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -108,7 +108,6 @@ impl ThreadBlockSharedSlice { } } -#[doc(hidden)] #[cfg(feature = "device")] /// # Safety /// From c8761b0f5a133af6e9c7f9995747b75ea2261620 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:47:10 +0000 Subject: [PATCH 065/120] Add RustToCuda impls for &T, &mut T, &[T], and &mut [T] where T: RustToCuda --- src/utils/box.rs | 4 +- src/utils/boxed_slice.rs | 6 +--
src/utils/mod.rs | 4 ++ src/utils/ref.rs | 83 +++++++++++++++++++++++++++++++++ src/utils/ref_mut.rs | 93 +++++++++++++++++++++++++++++++++++++ src/utils/slice_ref.rs | 88 +++++++++++++++++++++++++++++++++++ src/utils/slice_ref_mut.rs | 95 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 366 insertions(+), 7 deletions(-) create mode 100644 src/utils/ref.rs create mode 100644 src/utils/ref_mut.rs create mode 100644 src/utils/slice_ref.rs create mode 100644 src/utils/slice_ref_mut.rs diff --git a/src/utils/box.rs b/src/utils/box.rs index 8672c36a0..f9c271a67 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -24,9 +24,7 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct BoxCudaRepresentation(*mut T); // Safety: This repr(C) struct only contains a device-owned pointer unsafe impl rustacuda_core::DeviceCopy diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index e9113d865..e4796f2f2 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -24,11 +24,9 @@ use crate::{ #[allow(clippy::module_name_repetitions)] #[derive(Debug, TypeLayout)] #[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct BoxedSliceCudaRepresentation(*mut T, usize); -// Safety: This repr(C) struct only contains a device-owned pointer +// Safety: This repr(C) struct only contains a device-owned pointer and a usize unsafe impl rustacuda_core::DeviceCopy for BoxedSliceCudaRepresentation { diff --git a/src/utils/mod.rs b/src/utils/mod.rs index dadf5a443..73d422f05 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -6,3 +6,7 @@ pub mod shared; mod r#box; mod boxed_slice; mod option; +mod r#ref; +mod ref_mut; +mod slice_ref; +mod slice_ref_mut; diff --git a/src/utils/ref.rs b/src/utils/ref.rs new file mode 100644 index 000000000..6475d9ccf --- 
/dev/null +++ b/src/utils/ref.rs @@ -0,0 +1,83 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *const T, + _marker: PhantomData<&'a T>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for RefCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = RefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + + Ok(( + DeviceAccessible::from(RefCudaRepresentation { + data: device_box.as_device_ptr().as_raw().cast(), + _marker: PhantomData::<&'a T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe 
impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { + type RustRepresentation = &'a T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + &*this.data + } +} diff --git a/src/utils/ref_mut.rs b/src/utils/ref_mut.rs new file mode 100644 index 000000000..a5cbae62a --- /dev/null +++ b/src/utils/ref_mut.rs @@ -0,0 +1,93 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *mut T, + _marker: PhantomData<&'a mut T>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for RefMutCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = RefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + + Ok(( + 
DeviceAccessible::from(RefMutCudaRepresentation { + data: device_box.as_device_ptr().as_raw_mut().cast(), + _marker: PhantomData::<&'a mut T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for RefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let data: *mut T = this.data; + &mut *data + } +} diff --git a/src/utils/slice_ref.rs b/src/utils/slice_ref.rs new file mode 100644 index 000000000..a2a5e5012 --- /dev/null +++ b/src/utils/slice_ref.rs @@ -0,0 +1,88 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, TypeLayout)] +#[repr(C)] +pub struct SliceRefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *const T, + len: usize, + _marker: PhantomData<&'a [T]>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer and a usize +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for SliceRefCudaRepresentation<'a, T> +{ +} + +unsafe 
impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + SafeDeviceCopyWrapper::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: device_buffer.as_ptr().cast(), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for SliceRefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts(this.data, this.len) + } +} diff --git a/src/utils/slice_ref_mut.rs b/src/utils/slice_ref_mut.rs new file mode 100644 index 000000000..64371a1e3 --- /dev/null +++ b/src/utils/slice_ref_mut.rs @@ -0,0 +1,95 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + 
utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, TypeLayout)] +#[repr(C)] +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *mut T, + len: usize, + _marker: PhantomData<&'a mut [T]>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer and a usize +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for SliceRefMutCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + SafeDeviceCopyWrapper::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefMutCudaRepresentation { + data: device_buffer.as_mut_ptr().cast(), + len: device_buffer.len(), + _marker: PhantomData::<&'a mut [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for SliceRefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { 
+ core::slice::from_raw_parts_mut(this.data, this.len) + } +} From 8d2d85667308246ed885b05d4c06917b774c1e03 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 14:51:27 +0000 Subject: [PATCH 066/120] Large restructuring of the module layout for rust-cuda --- examples/derive/src/lib.rs | 4 +- examples/print/src/main.rs | 7 +- examples/single-source/src/main.rs | 18 +- .../kernel/wrapper/generate/host_kernel_ty.rs | 4 +- .../generate/host_linker_macro/get_ptx.rs | 4 +- .../wrapper/generate/host_linker_macro/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 2 +- .../src/rust_to_cuda/field_copy.rs | 44 +- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 10 +- rust-cuda-derive/src/rust_to_cuda/generics.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 40 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/alloc.rs | 52 ++ src/common.rs | 311 ----------- src/device/mod.rs | 73 --- src/host/mod.rs | 525 +----------------- src/kernel/mod.rs | 520 +++++++++++++++++ src/{kernel.rs => kernel/param.rs} | 114 ++-- src/{host => kernel}/ptx_jit/mod.rs | 0 src/{host => kernel}/ptx_jit/preprocess.rs | 0 src/{host => kernel}/ptx_jit/regex.rs | 0 src/{host => kernel}/ptx_jit/replace.rs | 0 src/{utils => lend/impls}/box.rs | 8 +- src/{utils => lend/impls}/boxed_slice.rs | 8 +- src/lend/impls/mod.rs | 7 + src/{utils => lend/impls}/option.rs | 9 +- src/{utils => lend/impls}/ref.rs | 8 +- src/{utils => lend/impls}/ref_mut.rs | 8 +- src/{utils => lend/impls}/slice_ref.rs | 8 +- src/{utils => lend/impls}/slice_ref_mut.rs | 8 +- src/lend/mod.rs | 283 ++++++++++ src/lib.rs | 4 +- src/safety/device_copy.rs | 2 +- src/safety/no_aliasing.rs | 2 +- src/utils/aliasing/const.rs | 21 +- src/utils/aliasing/dynamic.rs | 21 +- src/utils/aliasing/final.rs | 21 +- src/utils/device_copy.rs | 7 +- src/utils/exchange/buffer/common.rs | 6 +- src/utils/exchange/buffer/host.rs | 3 +- src/utils/exchange/buffer/mod.rs | 10 +- src/utils/exchange/wrapper.rs | 7 +- src/utils/ffi.rs | 
133 +++++ src/utils/mod.rs | 9 +- src/utils/shared/mod.rs | 3 + src/utils/shared/slice.rs | 43 ++ 46 files changed, 1269 insertions(+), 1108 deletions(-) create mode 100644 src/alloc.rs delete mode 100644 src/common.rs create mode 100644 src/kernel/mod.rs rename src/{kernel.rs => kernel/param.rs} (93%) rename src/{host => kernel}/ptx_jit/mod.rs (100%) rename src/{host => kernel}/ptx_jit/preprocess.rs (100%) rename src/{host => kernel}/ptx_jit/regex.rs (100%) rename src/{host => kernel}/ptx_jit/replace.rs (100%) rename src/{utils => lend/impls}/box.rs (93%) rename src/{utils => lend/impls}/boxed_slice.rs (93%) create mode 100644 src/lend/impls/mod.rs rename src/{utils => lend/impls}/option.rs (96%) rename src/{utils => lend/impls}/ref.rs (93%) rename src/{utils => lend/impls}/ref_mut.rs (93%) rename src/{utils => lend/impls}/slice_ref.rs (93%) rename src/{utils => lend/impls}/slice_ref_mut.rs (93%) create mode 100644 src/lend/mod.rs create mode 100644 src/utils/ffi.rs diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 76a7d3cb1..622b1b699 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -2,14 +2,14 @@ #![feature(const_type_name)] #![feature(offset_of)] -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] struct Outer { #[cuda(embed)] diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7a26ce2bd..31c6897f3 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -23,7 +23,7 @@ pub enum Action { #[rust_cuda::kernel::kernel(use link! 
for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(action: rust_cuda::kernel::PerThreadShallowCopy) { +pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), @@ -62,11 +62,10 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { )?); // Create a new instance of the CUDA kernel and prepare the launch config - let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); - let config = rust_cuda::host::LaunchConfig { + let mut kernel = rust_cuda::kernel::TypedPtxKernel::::new::(None); + let config = rust_cuda::kernel::LaunchConfig { grid: rust_cuda::deps::rustacuda::function::GridSize::x(1), block: rust_cuda::deps::rustacuda::function::BlockSize::x(4), - shared_memory_size: 0, ptx_jit: false, }; diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 383ade30a..40d212294 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -23,7 +23,7 @@ fn main() {} #[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { @@ -31,7 +31,7 @@ pub struct Wrapper { inner: T, } -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] pub struct Empty([u8; 0]); @@ -54,18 +54,18 @@ pub struct Triple(i32, i32, i32); pub fn kernel< 'a, T: 'static - + rc::common::RustToCuda< + + rc::lend::RustToCuda< CudaRepresentation: rc::safety::StackOnly, - CudaAllocation: rc::common::EmptyCudaAlloc, + CudaAllocation: rc::alloc::EmptyCudaAlloc, > + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - _x: &rc::kernel::PerThreadShallowCopy, - _z: &rc::kernel::SharedHeapPerThreadShallowCopy>, - _v @ _w: &'a rc::kernel::ShallowInteriorMutable, - 
_: rc::kernel::SharedHeapPerThreadShallowCopy>, - q @ Triple(s, mut __t, _u): rc::kernel::PerThreadShallowCopy, + _x: &rc::kernel::param::PerThreadShallowCopy, + _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable, + _: rc::kernel::param::SharedHeapPerThreadShallowCopy>, + q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy, shared3: &mut rc::utils::shared::ThreadBlockShared, dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, ) { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs index 75c86820f..84ece28b5 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -43,7 +43,7 @@ pub(in super::super) fn quote_host_kernel_ty( pub type #func_ident #generic_start_token #generic_kernel_params #generic_close_token = impl Fn( - &mut #crate_path::host::Launcher<#func_ident #generic_start_token + &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, #(#cuda_kernel_param_tys),* @@ -56,7 +56,7 @@ pub(in super::super) fn quote_host_kernel_ty( fn #private_func_ident #generic_start_token #generic_kernel_params #generic_close_token ( - #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #launcher: &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, #func_inputs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index d7394142e..599b68fce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -47,7 +47,7 @@ pub(super) fn quote_get_ptx( }> = 
#crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( PTX_CSTR.to_bytes(), - #crate_path::host::specialise_kernel_entry_point!( + #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -85,7 +85,7 @@ pub(super) fn quote_get_ptx( #[allow(unused_imports)] use __rust_cuda_ffi_safe_assert::#args; - #crate_path::host::link_kernel!{ + #crate_path::kernel::link_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs index dc609da26..cfc0af751 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -91,14 +91,14 @@ pub(in super::super) fn quote_host_linker_macro( #(#macro_generics),* $(,)? 
#generic_close_token for $ptx:ident ) { - unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::kernel::CompiledKernelPtx< #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token { #get_ptx fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_entry_point!( + #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_non_lt_generic_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index f3e1177bc..7793c2dc0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -346,7 +346,7 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( + const _: ::core::result::Result<(), ()> = #crate_path::kernel::check_kernel!( #func_ident #func_ident_hash #crate_name #crate_manifest_dir ); } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 549f5ab56..f6464d197 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -34,12 +34,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = #crate_path::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); r2c_field_async_declarations.push(quote! 
{ - let #field_repr_ident = #crate_path::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -50,26 +50,26 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::common::CombinedCudaAlloc< - <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( &self.#field_accessor, alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, stream, @@ -81,13 +81,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( &mut self.#field_accessor, alloc_front, stream, @@ -96,30 +96,30 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::common::CombinedCudaAlloc< - <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( < - #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -131,17 +131,17 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( < - #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); r2c_field_async_destructors.push(quote! 
{ - let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( < - #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, stream, @@ -150,8 +150,8 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - #crate_path::common::RustToCudaProxy::<#field_ty>::into( - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::RustToCudaProxy::<#field_ty>::into( + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 21509ef8c..aee846fe3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -36,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs( field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - #crate_path::common::DeviceAccessible< - <#field_ty as #crate_path::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#field_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; } else { @@ -57,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs( Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - #crate_path::common::DeviceAccessible< - <#proxy_ty as #crate_path::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -107,7 +107,7 @@ pub fn swap_field_type_and_filter_attrs( cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - #crate_path::common::DeviceAccessible< + #crate_path::utils::ffi::DeviceAccessible< #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index b9335db46..4325f39fb 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -190,13 +190,13 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: #crate_path::common::RustToCuda + #ty: #crate_path::lend::RustToCuda }); struct_generics_cuda_async .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: #crate_path::common::RustToCudaAsync + #ty: #crate_path::lend::RustToCudaAsync }); } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 2928cebef..b7dc1eb13 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -76,7 +76,7 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::RustToCuda for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::lend::RustToCuda for #struct_name #ty_generics #where_clause { type CudaRepresentation = #struct_name_cuda #ty_generics; @@ -84,14 +84,14 @@ pub fn rust_to_cuda_trait( type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::common::DeviceAccessible, - #crate_path::common::CombinedCudaAlloc + #crate_path::utils::ffi::DeviceAccessible, + #crate_path::alloc::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NoCudaAlloc; + let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -99,15 +99,15 @@ pub fn rust_to_cuda_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::utils::ffi::DeviceAccessible::from(borrow), + #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: #crate_path::common::CombinedCudaAlloc< + alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, ) -> #crate_path::deps::rustacuda::error::CudaResult { @@ -149,19 +149,19 @@ pub fn rust_to_cuda_async_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::RustToCudaAsync for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::common::DeviceAccessible, - #crate_path::common::CombinedCudaAlloc + #crate_path::utils::ffi::DeviceAccessible, + #crate_path::alloc::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NoCudaAlloc; + let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -169,15 +169,15 @@ pub fn rust_to_cuda_async_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::utils::ffi::DeviceAccessible::from(borrow), + #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: #crate_path::common::CombinedCudaAlloc< + alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, stream: &#crate_path::deps::rustacuda::stream::Stream, @@ -217,14 +217,14 @@ pub fn cuda_as_rust_trait( let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::CudaAsRust + unsafe impl #impl_generics #crate_path::lend::CudaAsRust for #struct_name_cuda #ty_generics #where_clause { type RustRepresentation = #struct_name #ty_generics; #[cfg(target_os = "cuda")] unsafe fn as_rust( - this: &#crate_path::common::DeviceAccessible, + this: &#crate_path::utils::ffi::DeviceAccessible, ) -> #struct_name #ty_generics { #cuda_as_rust_struct_construction } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index fb5b39503..6a885ac94 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! { - #crate_path::common::NoCudaAlloc + #crate_path::alloc::NoCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/alloc.rs b/src/alloc.rs new file mode 100644 index 000000000..f16178aec --- /dev/null +++ b/src/alloc.rs @@ -0,0 +1,52 @@ +#![allow(clippy::module_name_repetitions)] + +pub trait EmptyCudaAlloc: sealed::empty::Sealed {} + +pub trait CudaAlloc: sealed::alloc::Sealed {} + +impl CudaAlloc for Option {} +impl sealed::alloc::Sealed for Option {} + +pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} +impl sealed::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} +impl sealed::empty::Sealed for NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} +impl sealed::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} +impl !sealed::empty::Sealed for SomeCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} +impl sealed::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for 
CombinedCudaAlloc +{ +} +impl sealed::empty::Sealed + for CombinedCudaAlloc +{ +} +impl CombinedCudaAlloc { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} + +pub(crate) mod sealed { + pub(super) mod empty { + pub trait Sealed {} + } + + pub mod alloc { + pub trait Sealed {} + } +} diff --git a/src/common.rs b/src/common.rs deleted file mode 100644 index 37d005ac4..000000000 --- a/src/common.rs +++ /dev/null @@ -1,311 +0,0 @@ -use core::marker::PhantomData; -#[cfg(feature = "device")] -use core::{ - convert::{AsMut, AsRef}, - ops::{Deref, DerefMut}, -}; - -#[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; -#[cfg(feature = "host")] -use std::fmt; - -use const_type_layout::{TypeGraphLayout, TypeLayout}; -use rustacuda_core::DeviceCopy; - -#[cfg(feature = "derive")] -pub use rust_cuda_derive::LendRustToCuda; - -#[cfg(feature = "host")] -use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; - -#[repr(transparent)] -#[cfg_attr(any(feature = "device", doc), derive(Debug))] -#[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} - -#[cfg(feature = "host")] -impl From for DeviceAccessible { - fn from(value: T) -> Self { - Self(value) - } -} - -#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { - fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) - } -} - -#[cfg(all(feature = "host", not(doc)))] -impl fmt::Debug for DeviceAccessible { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct(stringify!(DeviceAccessible)) - .finish_non_exhaustive() - } -} - -#[cfg(feature = "device")] -impl Deref for DeviceAccessible { - type Target = T; - - fn deref(&self) -> 
&Self::Target { - &self.0 - } -} - -#[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCuda { - type CudaAllocation: CudaAlloc; - type CudaRepresentation: CudaAsRust + TypeGraphLayout; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the - /// CPU as it contains a GPU-resident copy of `self`. - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )>; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCudaAsync: RustToCuda { - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - /// The returned - /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER - /// be accessed on the CPU as it contains a GPU-resident copy of - /// `self`. 
- #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )>; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { - type RustRepresentation: RustToCuda; - - #[doc(hidden)] - #[cfg(feature = "device")] - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; -} - -pub trait RustToCudaProxy: RustToCuda { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -pub trait RustToCudaAsyncProxy: RustToCudaAsync { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -#[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *const T, - pub(super) reference: PhantomData<&'r T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - 
pub(super) reference: PhantomData<&'r mut T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceOwnedRef<'r, T: DeviceCopy> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - pub(super) reference: PhantomData<&'r mut ()>, - pub(super) marker: PhantomData, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} - -pub(crate) mod crate_private { - pub mod alloc { - pub trait Sealed {} - } -} - -mod private { - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} - -pub trait CudaAlloc: crate_private::alloc::Sealed {} - -impl CudaAlloc for Option {} -impl crate_private::alloc::Sealed for Option {} - -pub struct NoCudaAlloc; -impl CudaAlloc for NoCudaAlloc {} -impl crate_private::alloc::Sealed for NoCudaAlloc {} -impl EmptyCudaAlloc for NoCudaAlloc {} -impl private::empty::Sealed for NoCudaAlloc {} - -pub struct SomeCudaAlloc(()); -impl CudaAlloc for SomeCudaAlloc {} -impl crate_private::alloc::Sealed for SomeCudaAlloc {} -impl !EmptyCudaAlloc for SomeCudaAlloc {} -impl !private::empty::Sealed for SomeCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl CudaAlloc for CombinedCudaAlloc {} -impl crate_private::alloc::Sealed for CombinedCudaAlloc {} -impl 
EmptyCudaAlloc - for CombinedCudaAlloc -{ -} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - #[must_use] - pub const fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 07894b5bb..0c2a0c83f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,79 +1,6 @@ -use core::mem::ManuallyDrop; - #[cfg(feature = "derive")] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; -use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceOwnedRef, RustToCuda}, - safety::{NoSafeAliasing, SafeDeviceCopy}, -}; - pub mod alloc; pub mod thread; pub mod utils; - -pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceConstRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). - unsafe fn with_borrow_from_rust O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O; - - // /// # Safety - // /// - // /// This function is only safe to call iff `cuda_repr_mut` is the - // /// [`DeviceMutRef`] borrowed on the CPU using the corresponding - // /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). - // /// Furthermore, since different GPU threads can access heap storage - // /// mutably inside the safe `inner` scope, there must not be any - // /// aliasing between concurrently running threads. - // unsafe fn with_borrow_from_rust_mut O>( - // cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, - // ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). 
- unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceOwnedRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy; -} - -impl BorrowFromRust for T { - #[inline] - unsafe fn with_borrow_from_rust O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O { - // `rust_repr` must never be dropped as we do NOT own any of the - // heap memory it might reference - let rust_repr = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); - - inner(&rust_repr) - } - - #[inline] - unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceOwnedRef< - DeviceAccessible<::CudaRepresentation>, - >, - inner: F, - ) -> O - where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - { - inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs index 2d423362a..ba37e32e2 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -1,523 +1,24 @@ use std::{ - ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, - ptr::NonNull, }; use rustacuda::{ context::Context, error::{CudaError, CudaResult}, event::Event, - function::Function, memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; use rustacuda_core::{DeviceCopy, DevicePointer}; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; - use crate::{ - common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, - NoCudaAlloc, RustToCuda, - }, - kernel::CudaKernelParameter, - safety::{NoSafeAliasing, SafeDeviceCopy}, + safety::SafeDeviceCopy, + utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, }; -mod ptx_jit; -use ptx_jit::{PtxJITCompiler, PtxJITResult}; - -pub struct Launcher<'stream, 'kernel, Kernel> { - pub stream: &'stream Stream, - pub kernel: &'kernel mut TypedPtxKernel, - pub config: LaunchConfig, -} - -macro_rules! 
impl_launcher_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch<$($T: CudaKernelParameter),*>( - &mut self, - $($arg: $T::SyncHostType),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $with_async< - 'a, - Ok, - Err: From, - $($T: CudaKernelParameter),* - >( - &'a mut self, - $($arg: $T::SyncHostType,)* - inner: impl FnOnce( - &'a mut Self, - $($T::AsyncHostType<'stream, '_>),* - ) -> Result, - ) -> Result - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - #[allow(unused_variables)] - let stream = self.stream; - - impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - inner(self, $($arg),*) - } } - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch_async<$($T: CudaKernelParameter),*>( - &mut self, - $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) - } - }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { - impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } - }) - }; -} - -impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - impl_launcher_launch! { launch0() => with0_async => launch0_async } - - impl_launcher_launch! 
{ launch1( - arg1: A - ) => with1_async => launch1_async } - - impl_launcher_launch! { launch2( - arg1: A, arg2: B - ) => with2_async => launch2_async } - - impl_launcher_launch! { launch3( - arg1: A, arg2: B, arg3: C - ) => with3_async => launch3_async } - - impl_launcher_launch! { launch4( - arg1: A, arg2: B, arg3: C, arg4: D - ) => with4_async => launch4_async } - - impl_launcher_launch! { launch5( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => with5_async => launch5_async } - - impl_launcher_launch! { launch6( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => with6_async => launch6_async } - - impl_launcher_launch! { launch7( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => with7_async => launch7_async } - - impl_launcher_launch! { launch8( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => with8_async => launch8_async } - - impl_launcher_launch! { launch9( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => with9_async => launch9_async } - - impl_launcher_launch! { launch10( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => with10_async => launch10_async } - - impl_launcher_launch! { launch11( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K - ) => with11_async => launch11_async } - - impl_launcher_launch! 
{ launch12( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K, arg12: L - ) => with12_async => launch12_async } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct LaunchConfig { - pub grid: rustacuda::function::GridSize, - pub block: rustacuda::function::BlockSize, - pub shared_memory_size: u32, - pub ptx_jit: bool, -} - -pub struct RawPtxKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl RawPtxKernel { - /// # Errors - /// - /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. - pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for RawPtxKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} - -pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; - -pub struct TypedPtxKernel { - compiler: PtxJITCompiler, - ptx_kernel: Option, - entry_point: Box, - configure: Option>, - marker: PhantomData, -} - -macro_rules! 
impl_typed_kernel_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch<$($T: CudaKernelParameter),*>( - &mut self, - stream: &Stream, - config: &LaunchConfig, - $($arg: $T::SyncHostType),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.$with_async::<(), CudaError, $($T),*>( - stream, - config, - $($arg,)* - |kernel, stream, config, $($arg),*| { - let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); - - // important: always synchronise here, this function is sync! - match (stream.synchronize(), result) { - (Ok(()), result) => result, - (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), - } - }, - ) - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $with_async< - 'a, - 'stream, - Ok, - Err: From, - $($T: CudaKernelParameter),* - >( - &'a mut self, - stream: &'stream Stream, - config: &LaunchConfig, - $($arg: $T::SyncHostType,)* - inner: impl FnOnce( - &'a mut Self, - &'stream Stream, - &LaunchConfig, - $($T::AsyncHostType<'stream, '_>),* - ) -> Result, - ) -> Result - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - inner(self, stream, config, $($arg),*) - } } - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch_async<'stream, $($T: CudaKernelParameter),*>( - &mut self, - stream: &'stream Stream, - config: &LaunchConfig, - $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - let function = if config.ptx_jit { - impl_typed_kernel_launch! 
{ impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { - self.compile_with_ptx_jit_args(Some(&[$($arg),*])) - } }? - } else { - self.compile_with_ptx_jit_args(None)? - }; - - unsafe { stream.launch( - function, - config.grid.clone(), - config.block.clone(), - config.shared_memory_size, - &[ - $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg) - ).cast::()),* - ], - ) } - } - }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { - impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } - }) - }; - (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func(&$arg0 $(, $other)*, |$arg0| { - impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner } - }) - }; -} - -impl TypedPtxKernel { - #[must_use] - pub fn new>(configure: Option>) -> Self { - let compiler = PtxJITCompiler::new(T::get_ptx()); - let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - - Self { - compiler, - ptx_kernel: None, - entry_point, - configure, - marker: PhantomData::, - } - } -} - -impl TypedPtxKernel { - impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } - - impl_typed_kernel_launch! { launch1( - arg1: A - ) => with1_async => launch1_async } - - impl_typed_kernel_launch! { launch2( - arg1: A, arg2: B - ) => with2_async => launch2_async } - - impl_typed_kernel_launch! { launch3( - arg1: A, arg2: B, arg3: C - ) => with3_async => launch3_async } - - impl_typed_kernel_launch! { launch4( - arg1: A, arg2: B, arg3: C, arg4: D - ) => with4_async => launch4_async } - - impl_typed_kernel_launch! 
{ launch5( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => with5_async => launch5_async } - - impl_typed_kernel_launch! { launch6( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => with6_async => launch6_async } - - impl_typed_kernel_launch! { launch7( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => with7_async => launch7_async } - - impl_typed_kernel_launch! { launch8( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => with8_async => launch8_async } - - impl_typed_kernel_launch! { launch9( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => with9_async => launch9_async } - - impl_typed_kernel_launch! { launch10( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => with10_async => launch10_async } - - impl_typed_kernel_launch! { launch11( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K - ) => with11_async => launch11_async } - - impl_typed_kernel_launch! { launch12( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K, arg12: L - ) => with12_async => launch12_async } - - /// # Errors - /// - /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to - /// [`Self::new`] is not a valid PTX source or does not contain the - /// entry point it declares. 
- fn compile_with_ptx_jit_args( - &mut self, - arguments: Option<&[Option<&NonNull<[u8]>>]>, - ) -> CudaResult<&Function> { - let ptx_jit = self.compiler.with_arguments(arguments); - - let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { - (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), - (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; - - // Replace the existing compiled kernel, drop the old one - let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); - - let function = ptx_kernel.get_function(); - - if let Some(configure) = self.configure.as_mut() { - configure(function)?; - } - - function - }, - }; - - Ok(kernel_jit) - } -} - -pub trait LendToCuda: RustToCuda + NoSafeAliasing { - /// Lends an immutable copy of `&self` to CUDA: - /// - code in the CUDA kernel can only access `&self` through the - /// [`DeviceConstRef`] inside the closure - /// - after the closure, `&self` will not have changed - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result; - - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: RustToCuda; -} - -impl LendToCuda for T { - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); - - core::mem::drop(cuda_repr); - core::mem::drop(alloc); 
- - result - } - - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: RustToCuda, - { - let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceOwned::with_new(cuda_repr, inner); - - core::mem::drop(alloc); - - result - } -} - pub trait CudaDroppable: Sized { #[allow(clippy::missing_errors_doc)] fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; @@ -525,8 +26,8 @@ pub trait CudaDroppable: Sized { #[repr(transparent)] pub struct CudaDropWrapper(ManuallyDrop); -impl crate::common::CudaAlloc for CudaDropWrapper {} -impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} +impl crate::alloc::CudaAlloc for CudaDropWrapper {} +impl crate::alloc::sealed::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) @@ -646,8 +147,8 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); -impl crate::common::CudaAlloc for HostDeviceBox {} -impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} +impl crate::alloc::CudaAlloc for HostDeviceBox {} +impl crate::alloc::sealed::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors @@ -1146,17 +647,3 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea self.host_val } } - -/// # Safety -/// -/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond -/// to the compiled kernel code for the `Kernel` function and contain a kernel -/// entry point whose name is returned by -/// [`CompiledKernelPtx::get_entry_point`]. -/// -/// This trait should not be implemented manually – use the -/// [`kernel`](crate::kernel::kernel) macro instead. 
-pub unsafe trait CompiledKernelPtx { - fn get_ptx() -> &'static CStr; - fn get_entry_point() -> &'static CStr; -} diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs new file mode 100644 index 000000000..f5aeeb4bf --- /dev/null +++ b/src/kernel/mod.rs @@ -0,0 +1,520 @@ +#[cfg(feature = "host")] +use std::{ + ffi::{CStr, CString}, + marker::PhantomData, + mem::ManuallyDrop, + ptr::NonNull, +}; + +use const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::{ + error::{CudaError, CudaResult}, + function::Function, + module::Module, + stream::Stream, +}; + +#[cfg(feature = "derive")] +pub use rust_cuda_derive::kernel; + +#[doc(hidden)] +#[cfg(all(feature = "derive", feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; + +#[cfg(feature = "host")] +mod ptx_jit; +#[cfg(feature = "host")] +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + +pub mod param; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + #[doc(hidden)] + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b>; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> std::alloc::Layout; + + #[doc(hidden)] + #[cfg(feature = "host")] + 
fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; +} + +#[cfg(feature = "host")] +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: &'stream Stream, + pub kernel: &'kernel mut TypedPtxKernel, + pub config: LaunchConfig, +} + +#[cfg(feature = "host")] +macro_rules! impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { + impl_launcher_launch! { launch0() => with0_async => launch0_async } + + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_launcher_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_launcher_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_launcher_launch! 
{ launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } +} + +#[cfg(feature = "host")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LaunchConfig { + pub grid: rustacuda::function::GridSize, + pub block: rustacuda::function::BlockSize, + pub ptx_jit: bool, +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct RawPtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, +} + +#[cfg(feature = "host")] +impl RawPtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. 
+ pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } +} + +#[cfg(feature = "host")] +impl Drop for RawPtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } + + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); + } + } +} + +#[cfg(feature = "host")] +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct TypedPtxKernel { + compiler: PtxJITCompiler, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, +} + +#[cfg(feature = "host")] +macro_rules! 
impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + + // important: always synchronise here, this function is sync! + match (stream.synchronize(), result) { + (Ok(()), result) => result, + (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), + } + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'kernel, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'kernel mut Self, + &'stream Stream, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + impl_typed_kernel_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + let function = if config.ptx_jit { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + self.compile_with_ptx_jit_args(Some(&[$($arg),*])) + } }? + } else { + self.compile_with_ptx_jit_args(None)? + }; + + #[allow(unused_mut)] + let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new(); + $( + shared_memory_size.add($T::shared_layout_for_async(&$arg)); + )* + let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else { + // FIXME: this should really be InvalidConfiguration = 9 + return Err(CudaError::LaunchOutOfResources) + }; + + unsafe { stream.launch( + function, + config.grid.clone(), + config.block.clone(), + shared_memory_size, + &[ + $(core::ptr::from_mut( + &mut $T::async_to_ffi($arg) + ).cast::()),* + ], + ) } + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; + (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func(&$arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! 
{ impl $func ref ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); + + Self { + compiler, + ptx_kernel: None, + entry_point, + configure, + marker: PhantomData::, + } + } +} + +#[cfg(feature = "host")] +impl TypedPtxKernel { + impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } + + impl_typed_kernel_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_typed_kernel_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_typed_kernel_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_typed_kernel_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_typed_kernel_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_typed_kernel_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_typed_kernel_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_typed_kernel_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_typed_kernel_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_typed_kernel_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_typed_kernel_launch! 
{ launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_typed_kernel_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } + + /// # Errors + /// + /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to + /// [`Self::new`] is not a valid PTX source or does not contain the + /// entry point it declares. + fn compile_with_ptx_jit_args( + &mut self, + arguments: Option<&[Option<&NonNull<[u8]>>]>, + ) -> CudaResult<&Function> { + let ptx_jit = self.compiler.with_arguments(arguments); + + let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { + (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), + (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { + let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; + + // Replace the existing compiled kernel, drop the old one + let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); + + let function = ptx_kernel.get_function(); + + if let Some(configure) = self.configure.as_mut() { + configure(function)?; + } + + function + }, + }; + + Ok(kernel_jit) + } +} + +#[cfg(feature = "host")] +/// # Safety +/// +/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond +/// to the compiled kernel code for the `Kernel` function and contain a kernel +/// entry point whose name is returned by +/// [`CompiledKernelPtx::get_entry_point`]. +/// +/// This trait should not be implemented manually – use the +/// [`kernel`] macro instead. 
+pub unsafe trait CompiledKernelPtx { + fn get_ptx() -> &'static CStr; + fn get_entry_point() -> &'static CStr; +} diff --git a/src/kernel.rs b/src/kernel/param.rs similarity index 93% rename from src/kernel.rs rename to src/kernel/param.rs index 98ae0220c..2e4461051 100644 --- a/src/kernel.rs +++ b/src/kernel/param.rs @@ -6,58 +6,16 @@ use core::{ }; #[cfg(feature = "host")] -use core::ptr::NonNull; +use std::{alloc::Layout, ptr::NonNull}; use const_type_layout::TypeGraphLayout; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; - -use crate::common::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef, EmptyCudaAlloc, RustToCuda}; - -mod sealed { - #[doc(hidden)] - pub trait Sealed {} -} - -pub trait CudaKernelParameter: sealed::Sealed { - #[cfg(feature = "host")] - type SyncHostType; - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; - #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; - - #[cfg(feature = "host")] - #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; - - #[doc(hidden)] - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; -} +use crate::{ + alloc::EmptyCudaAlloc, + kernel::{sealed, CudaKernelParameter}, + lend::RustToCuda, + utils::ffi::{DeviceAccessible, DeviceConstRef, 
DeviceOwnedRef}, +}; pub struct PtxJit { never: !, @@ -144,6 +102,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -219,6 +182,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -280,6 +248,11 @@ impl< inner(Some(¶m_as_raw_bytes(param.for_host()))) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -374,6 +347,11 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -477,7 +455,7 @@ impl< _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) } #[cfg(feature = "host")] @@ -488,6 +466,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -500,7 +483,7 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - unsafe { 
crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } } } impl< @@ -534,7 +517,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) } #[cfg(feature = "host")] @@ -545,6 +528,11 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -557,7 +545,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed @@ -609,6 +597,11 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, @@ -664,6 +657,11 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara inner(Some(¶m_as_raw_bytes(param.for_host()))) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 
'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -758,6 +756,11 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( _param: Self::AsyncHostType<'stream, 'b>, @@ -810,6 +813,11 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> Layout { + param.layout() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, diff --git a/src/host/ptx_jit/mod.rs b/src/kernel/ptx_jit/mod.rs similarity index 100% rename from src/host/ptx_jit/mod.rs rename to src/kernel/ptx_jit/mod.rs diff --git a/src/host/ptx_jit/preprocess.rs b/src/kernel/ptx_jit/preprocess.rs similarity index 100% rename from src/host/ptx_jit/preprocess.rs rename to src/kernel/ptx_jit/preprocess.rs diff --git a/src/host/ptx_jit/regex.rs b/src/kernel/ptx_jit/regex.rs similarity index 100% rename from src/host/ptx_jit/regex.rs rename to src/kernel/ptx_jit/regex.rs diff --git a/src/host/ptx_jit/replace.rs b/src/kernel/ptx_jit/replace.rs similarity index 100% rename from src/host/ptx_jit/replace.rs rename to src/kernel/ptx_jit/replace.rs diff --git a/src/utils/box.rs b/src/lend/impls/box.rs similarity index 93% rename from src/utils/box.rs rename to src/lend/impls/box.rs index f9c271a67..4acfd7b2c 100644 --- a/src/utils/box.rs +++ b/src/lend/impls/box.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use 
crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -36,7 +36,7 @@ unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/lend/impls/boxed_slice.rs similarity index 93% rename from src/utils/boxed_slice.rs rename to src/lend/impls/boxed_slice.rs index e4796f2f2..6e1c95d90 100644 --- a/src/utils/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -36,7 +36,7 @@ unsafe impl RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs new file mode 100644 index 000000000..18f546bbd --- /dev/null +++ b/src/lend/impls/mod.rs @@ -0,0 +1,7 @@ +mod r#box; +mod boxed_slice; +mod option; +mod r#ref; +mod ref_mut; +mod slice_ref; +mod 
slice_ref_mut; diff --git a/src/utils/option.rs b/src/lend/impls/option.rs similarity index 96% rename from src/utils/option.rs rename to src/lend/impls/option.rs index dec109f38..291a4a255 100644 --- a/src/utils/option.rs +++ b/src/lend/impls/option.rs @@ -6,16 +6,13 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::error::CudaResult; use crate::{ - common::{ - CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, - RustToCudaProxy, - }, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, safety::SafeDeviceCopy, - utils::device_copy::SafeDeviceCopyWrapper, + utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc}; +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/ref.rs b/src/lend/impls/ref.rs similarity index 93% rename from src/utils/ref.rs rename to src/lend/impls/ref.rs index 6475d9ccf..c6aee84e6 100644 --- a/src/utils/ref.rs +++ b/src/lend/impls/ref.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -39,7 +39,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = 
crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/ref_mut.rs b/src/lend/impls/ref_mut.rs similarity index 93% rename from src/utils/ref_mut.rs rename to src/lend/impls/ref_mut.rs index a5cbae62a..a4f4dbe29 100644 --- a/src/utils/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -39,7 +39,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/slice_ref.rs b/src/lend/impls/slice_ref.rs similarity index 93% rename from src/utils/slice_ref.rs rename to src/lend/impls/slice_ref.rs index a2a5e5012..6108f9ccd 100644 --- a/src/utils/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ 
- common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -40,7 +40,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs similarity index 93% rename from src/utils/slice_ref_mut.rs rename to src/lend/impls/slice_ref_mut.rs index 64371a1e3..b2f79abf9 100644 --- a/src/utils/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -40,7 +40,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/lend/mod.rs b/src/lend/mod.rs new file mode 100644 index 000000000..a6cffea3d --- /dev/null +++ b/src/lend/mod.rs @@ -0,0 +1,283 @@ +use 
const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::error::CudaError; +use rustacuda_core::DeviceCopy; + +#[cfg(feature = "derive")] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::LendRustToCuda; + +use crate::{alloc::CudaAlloc, utils::ffi::DeviceAccessible}; + +#[cfg(feature = "device")] +use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + host::{HostAndDeviceConstRef, HostAndDeviceOwned}, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::safety::{NoSafeAliasing, SafeDeviceCopy}; + +mod impls; + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCuda { + type CudaAllocation: CudaAlloc; + type CudaRepresentation: CudaAsRust; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the + /// CPU as it contains a GPU-resident copy of `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )>; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult; +} + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCudaAsync: RustToCuda { + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned + /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER + /// be accessed on the CPU as it contains a GPU-resident copy of + /// `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )>; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult; +} + +/// # Safety +/// +/// This is an internal trait and should NEVER be implemented manually +pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { + type RustRepresentation: RustToCuda; + + #[doc(hidden)] + #[cfg(feature = "device")] + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; +} + +pub trait RustToCudaProxy: RustToCuda { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +pub trait RustToCudaAsyncProxy: RustToCudaAsync { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCuda: RustToCuda + NoSafeAliasing { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result; + + /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] + /// + 
/// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCuda for T { + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, inner); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "device")] +pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. + unsafe fn with_borrow_from_rust O>( + cuda_repr: DeviceConstRef::CudaRepresentation>>, + inner: F, + ) -> O; + + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::move_to_cuda`]. 
+ unsafe fn with_moved_from_rust O>( + cuda_repr: DeviceOwnedRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: Sized, + ::CudaRepresentation: SafeDeviceCopy; +} + +#[cfg(feature = "device")] +impl BorrowFromRust for T { + #[inline] + unsafe fn with_borrow_from_rust O>( + cuda_repr: DeviceConstRef::CudaRepresentation>>, + inner: F, + ) -> O { + // `rust_repr` must never be dropped as we do NOT own any of the + // heap memory it might reference + let rust_repr = core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); + + inner(&rust_repr) + } + + #[inline] + unsafe fn with_moved_from_rust O>( + mut cuda_repr: DeviceOwnedRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: Sized, + ::CudaRepresentation: SafeDeviceCopy, + { + inner(CudaAsRust::as_rust(cuda_repr.as_mut())) + } +} diff --git a/src/lib.rs b/src/lib.rs index 6ba80f56f..c782c4047 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] +#![feature(layout_for_ptr)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] @@ -44,9 +45,10 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ #[cfg(all(feature = "device", not(target_os = "cuda"), not(doc)))] core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -pub mod common; +pub mod alloc; pub mod deps; pub mod kernel; +pub mod lend; pub mod safety; pub mod utils; diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index 9aedc8e81..a2bfc9552 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -1,6 +1,6 @@ use const_type_layout::TypeGraphLayout; -use crate::{common::DeviceAccessible, safety::StackOnly}; +use crate::{safety::StackOnly, utils::ffi::DeviceAccessible}; #[allow(clippy::module_name_repetitions)] /// Types which are safe to 
memcpy from the CPU to a GPU. diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index f5c80e354..7baa06f19 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -63,7 +63,7 @@ impl !NoSafeAliasing for *mut T {} unsafe impl NoSafeAliasing for core::marker::PhantomData {} unsafe impl NoSafeAliasing for r#final::Final {} -unsafe impl NoSafeAliasing +unsafe impl NoSafeAliasing for crate::utils::aliasing::FinalCudaRepresentation { } diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 131a05803..b3a28cf25 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -8,7 +8,10 @@ use core::{ use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(transparent)] #[derive(Clone, TypeLayout)] @@ -193,12 +196,12 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -209,9 +212,9 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } @@ -222,13 +225,13 @@ unsafe impl RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -239,9 
+242,9 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index a6577fc6f..50f028ec3 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -8,7 +8,10 @@ use core::{ use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(C)] #[derive(Clone, TypeLayout)] @@ -167,12 +170,12 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -186,9 +189,9 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } @@ -197,13 +200,13 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, 
alloc) = self.inner.borrow_async(alloc, stream)?; @@ -217,9 +220,9 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 230ea4e8a..432910920 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,7 +1,10 @@ use const_type_layout::TypeLayout; use r#final::Final; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[doc(hidden)] #[repr(transparent)] @@ -18,12 +21,12 @@ unsafe impl RustToCuda for Final { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow(alloc)?; @@ -34,9 +37,9 @@ unsafe impl RustToCuda for Final { } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T let inner: &mut T = &mut *(self as *mut Self).cast(); @@ -48,13 +51,13 @@ unsafe impl RustToCuda for Final { unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let 
(cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; @@ -65,9 +68,9 @@ unsafe impl RustToCudaAsync for Final { } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 1f03c1799..2363b4855 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,15 +3,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ - common::{CudaAsRust, NoCudaAlloc, RustToCuda, RustToCudaAsync}, + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc}; +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 31f50cb68..450ed0975 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,7 +1,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; -use crate::{common::CudaAsRust, safety::SafeDeviceCopy}; +use crate::{lend::CudaAsRust, safety::SafeDeviceCopy}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -29,7 +29,9 @@ unsafe impl; #[cfg(feature = "device")] - unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { + unsafe fn as_rust( + this: &crate::utils::ffi::DeviceAccessible, + ) -> Self::RustRepresentation { CudaExchangeBuffer { inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( 
crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 9bbf8a0af..58e200881 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -10,9 +10,10 @@ use rustacuda::{ }; use crate::{ - common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, safety::SafeDeviceCopy, + utils::ffi::DeviceAccessible, }; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index dcbbc036f..c1dea16d0 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -12,10 +12,16 @@ use const_type_layout::TypeGraphLayout; use crate::safety::SafeDeviceCopy; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}; +use crate::{ + alloc::NoCudaAlloc, + lend::{RustToCuda, RustToCudaAsync}, +}; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible}; +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + utils::ffi::DeviceAccessible, +}; #[cfg(any(feature = "host", feature = "device"))] use self::common::CudaExchangeBufferCudaRepresentation; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 5f64d3d05..2e9decc51 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -14,14 +14,13 @@ use rustacuda::{ }; use crate::{ - common::{ - CombinedCudaAlloc, DeviceAccessible, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, - RustToCudaAsync, - }, + alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, }, + lend::{RustToCuda, RustToCudaAsync}, + 
utils::ffi::DeviceAccessible, }; #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs new file mode 100644 index 000000000..98fd945e7 --- /dev/null +++ b/src/utils/ffi.rs @@ -0,0 +1,133 @@ +use core::marker::PhantomData; +#[cfg(feature = "device")] +use core::{ + convert::{AsMut, AsRef}, + ops::{Deref, DerefMut}, +}; +#[cfg(feature = "host")] +use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; + +#[cfg(feature = "host")] +use const_type_layout::TypeGraphLayout; +use const_type_layout::TypeLayout; +use rustacuda_core::DeviceCopy; + +#[cfg(feature = "host")] +use crate::{lend::CudaAsRust, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; + +#[repr(transparent)] +#[cfg_attr(any(feature = "device", doc), derive(Debug))] +#[derive(TypeLayout)] +pub struct DeviceAccessible(T); + +unsafe impl DeviceCopy for DeviceAccessible {} + +#[cfg(feature = "host")] +impl From for DeviceAccessible { + fn from(value: T) -> Self { + Self(value) + } +} + +#[cfg(feature = "host")] +impl From<&T> for DeviceAccessible> { + fn from(value: &T) -> Self { + let value = unsafe { + let mut uninit = MaybeUninit::uninit(); + copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); + uninit.assume_init() + }; + + Self(SafeDeviceCopyWrapper::from(value)) + } +} + +#[cfg(all(feature = "host", not(doc)))] +impl fmt::Debug for DeviceAccessible { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + fmt.debug_struct(stringify!(DeviceAccessible)) + .finish_non_exhaustive() + } +} + +#[cfg(feature = "device")] +impl Deref for DeviceAccessible { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(feature = "device")] +impl DerefMut for DeviceAccessible { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +#[repr(transparent)] +#[derive(Clone, Copy, TypeLayout)] +pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: 
*const T, + pub(crate) reference: PhantomData<&'r T>, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: *mut T, + pub(crate) reference: PhantomData<&'r mut T>, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} + +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceOwnedRef<'r, T: DeviceCopy> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: *mut T, + pub(crate) reference: PhantomData<&'r mut ()>, + pub(crate) marker: PhantomData, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 73d422f05..65a4379fb 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,12 +1,5 @@ pub mod aliasing; pub mod device_copy; pub mod exchange; +pub mod ffi; pub mod shared; - -mod r#box; -mod boxed_slice; -mod option; -mod r#ref; -mod ref_mut; -mod slice_ref; -mod slice_ref_mut; diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index dfd3f2019..b01dda26d 100644 --- 
a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -9,3 +9,6 @@ pub use r#static::ThreadBlockShared; #[doc(hidden)] #[cfg(feature = "device")] pub use slice::init; + +#[cfg(feature = "host")] +pub(crate) use slice::SharedMemorySize; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index f60276e6b..72ed7fde1 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,3 +1,5 @@ +use core::alloc::Layout; + use const_type_layout::TypeGraphLayout; #[allow(clippy::module_name_repetitions)] @@ -44,6 +46,12 @@ impl ThreadBlockSharedSlice { self.len() == 0 } + #[must_use] + pub fn layout(&self) -> Layout { + // Safety: the length of self.shared is always initialised + unsafe { Layout::for_value_raw(self.shared) } + } + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { @@ -124,3 +132,38 @@ pub unsafe fn init() { #[cfg(feature = "device")] core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); + +#[cfg(feature = "host")] +pub struct SharedMemorySize { + last_align: usize, + total_size: usize, +} + +#[cfg(feature = "host")] +impl SharedMemorySize { + #[must_use] + pub const fn new() -> Self { + Self { + // we allocate the shared memory with an alignment of 8 + last_align: 8, + total_size: 0, + } + } + + pub fn add(&mut self, layout: core::alloc::Layout) { + if layout.align() > self.last_align { + // in the worst case, we are one element of the smaller alignment + // into the larger alignment, so we need to pad the entire rest + let pessimistic_padding = layout.align() - self.last_align; + + self.total_size += pessimistic_padding; + } + + self.last_align = layout.align(); + self.total_size += layout.size(); + } + + pub const fn total(self) -> usize { + self.total_size + } +} From 3020fb08a583b79ec1b4294246f8909ce2a4cece Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:09:56 +0000 Subject: [PATCH 067/120] Split rust-cuda-kernel off from 
rust-cuda-derive --- .vscode/settings.json | 3 +- Cargo.toml | 10 +-- examples/print/Cargo.toml | 4 +- examples/single-source/Cargo.toml | 4 +- rust-cuda-derive/Cargo.toml | 20 +++--- rust-cuda-derive/src/lib.rs | 62 +++---------------- .../src/rust_to_cuda/field_copy.rs | 2 + rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 1 + rust-cuda-derive/src/rust_to_cuda/impl.rs | 1 + rust-cuda-kernel/Cargo.toml | 30 +++++++++ .../build.rs | 0 .../src/kernel/link/config.rs | 0 .../src/kernel/link/error.rs | 0 .../src/kernel/link/mod.rs | 0 .../src/kernel/link/ptx_compiler_sys.rs | 0 .../src/kernel/lints.rs | 0 .../src/kernel/mod.rs | 0 .../src/kernel/specialise/entry_point.rs | 0 .../src/kernel/specialise/function.rs | 0 .../src/kernel/specialise/mod.rs | 0 .../src/kernel/specialise/ty.rs | 0 .../src/kernel/utils.rs | 0 .../src/kernel/wrapper/config.rs | 0 .../wrapper/generate/cuda_generic_function.rs | 0 .../kernel/wrapper/generate/cuda_wrapper.rs | 0 .../kernel/wrapper/generate/host_kernel_ty.rs | 0 .../generate/host_linker_macro/args_trait.rs | 0 .../generate/host_linker_macro/get_ptx.rs | 0 .../wrapper/generate/host_linker_macro/mod.rs | 0 .../src/kernel/wrapper/generate/mod.rs | 0 .../src/kernel/wrapper/mod.rs | 0 .../src/kernel/wrapper/parse.rs | 0 rust-cuda-kernel/src/lib.rs | 60 ++++++++++++++++++ rust-toolchain | 2 +- src/device/mod.rs | 5 +- src/kernel/mod.rs | 8 +-- 36 files changed, 128 insertions(+), 84 deletions(-) create mode 100644 rust-cuda-kernel/Cargo.toml rename {rust-cuda-derive => rust-cuda-kernel}/build.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/config.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/error.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/ptx_compiler_sys.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/lints.rs (100%) rename {rust-cuda-derive => 
rust-cuda-kernel}/src/kernel/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/entry_point.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/function.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/ty.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/utils.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/config.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/cuda_generic_function.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/cuda_wrapper.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_kernel_ty.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/parse.rs (100%) create mode 100644 rust-cuda-kernel/src/lib.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index c2b4219f5..ddfa41463 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,7 +6,8 @@ "rust-analyzer.cargo.allFeatures": false, "rust-analyzer.cargo.features": [ "derive", - "host" + "host", + "kernel" ], "rust-analyzer.showUnlinkedFileNotification": false, } diff --git a/Cargo.toml b/Cargo.toml index 12a90ef59..655359684 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] members = [ - ".", "rust-cuda-derive", + ".", 
"rust-cuda-derive", "rust-cuda-kernel", "examples/derive", "examples/print", "examples/single-source", ] default-members = [ - ".", "rust-cuda-derive", + ".", "rust-cuda-derive", "rust-cuda-kernel", ] [package] @@ -19,9 +19,10 @@ rust-version = "1.75" # nightly [features] default = [] -host = ["dep:rustacuda", "dep:regex"] -device = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] +device = [] +host = ["dep:rustacuda", "dep:regex"] +kernel = ["dep:rust-cuda-kernel"] [dependencies] rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } @@ -36,3 +37,4 @@ const-type-layout = { version = "0.2.1", features = ["derive"] } final = "0.1.1" rust-cuda-derive = { path = "rust-cuda-derive", optional = true } +rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml index 05f3a537e..b7f864b58 100644 --- a/examples/print/Cargo.toml +++ b/examples/print/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "device"] } +rust-cuda = { path = "../../", features = ["kernel", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "host"] } +rust-cuda = { path = "../../", features = ["kernel", "host"] } diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 6f53359cd..1a27dd30e 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive", "device"] } +rc = { package = "rust-cuda", path = "../../", features = 
["derive", "kernel", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "host"] } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 60677b1dd..73a74907b 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" -links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,15 +15,10 @@ syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -regex = "1.5" -lazy_static = "1.4" -serde_json = "1.0" -cargo_metadata = { version = "0.18", features = ["builder"] } -strip-ansi-escapes = "0.2" -colored = "2.0" -thiserror = "1.0" -seahash = "4.1" -ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } - -[build-dependencies] -find_cuda_helper = "0.2" +# regex = "1.5" +# lazy_static = "1.4" +# serde_json = "1.0" +# cargo_metadata = { version = "0.18", features = ["builder"] } +# strip-ansi-escapes = "0.2" +# colored = "2.0" +# thiserror = "1.0" diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 4651be684..fba846798 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,13 +1,12 @@ -#![deny(clippy::pedantic)] -#![feature(box_patterns)] -#![feature(proc_macro_tracked_env)] -#![feature(proc_macro_span)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] #![feature(if_let_guard)] #![feature(let_chains)] -#![feature(map_try_insert)] -#![feature(proc_macro_def_site)] -#![feature(cfg_version)] 
-#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; @@ -17,14 +16,8 @@ extern crate proc_macro_error; use proc_macro::TokenStream; -mod kernel; mod rust_to_cuda; -// cargo expand --target x86_64-unknown-linux-gnu --ugly \ -// | rustfmt --config max_width=160 > out.rs -// cargo expand --target nvptx64-nvidia-cuda --ugly \ -// | rustfmt --config max_width=160 > out.rs - #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { @@ -37,44 +30,3 @@ pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Build the implementation of the `RustToCuda` and `CudaAsRust` traits rust_to_cuda::impl_rust_to_cuda(&ast) } - -#[proc_macro_error] -#[proc_macro_attribute] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::wrapper::kernel(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { - kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro_attribute] -pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::function::specialise_kernel_function(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::check_kernel(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) -} diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs 
index f6464d197..10f528730 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -21,10 +21,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations: &mut Vec, ) -> TokenStream { + #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, None => proc_macro2::Literal::usize_unsuffixed(field_index).to_token_stream(), }; + #[allow(clippy::option_if_let_else)] let field_repr_ident = match &field.ident { Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index aee846fe3..313daf86b 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -103,6 +103,7 @@ pub fn swap_field_type_and_filter_attrs( } }); + #[allow(clippy::option_if_let_else)] let cuda_repr_field_ty = if let Some(cuda_repr_field_ty) = cuda_repr_field_ty { cuda_repr_field_ty } else { diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index b7dc1eb13..5eee100c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -23,6 +23,7 @@ pub fn cuda_struct_declaration( quote! 
{ #[repr(C)] } }; + #[allow(clippy::option_if_let_else)] let struct_fields_where_clause = if let Some(struct_semi_cuda) = struct_semi_cuda { quote!(#struct_fields_cuda #where_clause #struct_semi_cuda) } else { diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml new file mode 100644 index 000000000..23e641841 --- /dev/null +++ b/rust-cuda-kernel/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "rust-cuda-kernel" +version = "0.1.0" +authors = ["Juniper Tyree "] +license = "MIT OR Apache-2.0" +edition = "2021" +links = "libnvptxcompiler_static" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +proc-macro = true + +[dependencies] +syn = { version = "1.0", features = ["full", "fold"] } +quote = "1.0" +proc-macro2 = "1.0" +proc-macro-error = "1.0" +regex = "1.5" +lazy_static = "1.4" +serde_json = "1.0" +cargo_metadata = { version = "0.18", features = ["builder"] } +strip-ansi-escapes = "0.2" +colored = "2.0" +thiserror = "1.0" +seahash = "4.1" +ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-kernel/build.rs similarity index 100% rename from rust-cuda-derive/build.rs rename to rust-cuda-kernel/build.rs diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/config.rs rename to rust-cuda-kernel/src/kernel/link/config.rs diff --git a/rust-cuda-derive/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/error.rs rename to rust-cuda-kernel/src/kernel/link/error.rs diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/mod.rs rename to 
rust-cuda-kernel/src/kernel/link/mod.rs diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs rename to rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs similarity index 100% rename from rust-cuda-derive/src/kernel/lints.rs rename to rust-cuda-kernel/src/kernel/lints.rs diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/mod.rs rename to rust-cuda-kernel/src/kernel/mod.rs diff --git a/rust-cuda-derive/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/entry_point.rs rename to rust-cuda-kernel/src/kernel/specialise/entry_point.rs diff --git a/rust-cuda-derive/src/kernel/specialise/function.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/function.rs rename to rust-cuda-kernel/src/kernel/specialise/function.rs diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/mod.rs rename to rust-cuda-kernel/src/kernel/specialise/mod.rs diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-kernel/src/kernel/specialise/ty.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/ty.rs rename to rust-cuda-kernel/src/kernel/specialise/ty.rs diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-kernel/src/kernel/utils.rs similarity index 100% rename from rust-cuda-derive/src/kernel/utils.rs rename to rust-cuda-kernel/src/kernel/utils.rs diff --git 
a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/config.rs rename to rust-cuda-kernel/src/kernel/wrapper/config.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs 
b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-kernel/src/kernel/wrapper/parse.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/parse.rs rename to rust-cuda-kernel/src/kernel/wrapper/parse.rs diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs new file mode 100644 index 000000000..b26a78531 --- /dev/null +++ b/rust-cuda-kernel/src/lib.rs @@ -0,0 +1,60 @@ +#![deny(clippy::pedantic)] +#![feature(box_patterns)] +#![feature(proc_macro_tracked_env)] +#![feature(proc_macro_span)] +#![feature(let_chains)] +#![feature(map_try_insert)] +#![feature(proc_macro_def_site)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] + +extern crate proc_macro; + +#[macro_use] +extern crate proc_macro_error; + +use proc_macro::TokenStream; + +mod kernel; + +#[proc_macro_error] +#[proc_macro_attribute] +pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::wrapper::kernel(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { + 
kernel::specialise::ty::specialise_kernel_type(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro_attribute] +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn check_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::check_kernel(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn link_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::link_kernel(tokens) +} diff --git a/rust-toolchain b/rust-toolchain index d6e655e5f..e6cfef665 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,5 @@ [toolchain] # Pin to final 1.75.0 nightly -channel = "nightly-2023-11-10" +channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/device/mod.rs b/src/device/mod.rs index 0c2a0c83f..791035d51 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,5 +1,6 @@ -#[cfg(feature = "derive")] -pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; +#[doc(hidden)] +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_type}; pub mod alloc; pub mod thread; diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index f5aeeb4bf..0f490c9b0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -15,13 +15,13 @@ use rustacuda::{ stream::Stream, }; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::kernel; #[doc(hidden)] -#[cfg(all(feature = "derive", feature = "host"))] +#[cfg(all(feature = "kernel", feature = "host"))] 
#[allow(clippy::module_name_repetitions)] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; +pub use rust_cuda_kernel::{check_kernel, link_kernel, specialise_kernel_entry_point}; #[cfg(feature = "host")] mod ptx_jit; From cc6edd0bd2e4dcf5e8b5ba2887b6b5b7f0fa0f97 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:11:30 +0000 Subject: [PATCH 068/120] Update codecov action to handle rust-cuda-kernel --- .github/workflows/coverage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 176d98baa..c54f606d5 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -60,6 +60,7 @@ jobs: -t lcov -o coverage.lcov --branch \ --keep-only "src/*" \ --keep-only "rust-cuda-derive/*" \ + --keep-only "rust-cuda-kernel/*" \ --ignore-not-existing \ --excl-line GRCOV_EXCL_LINE \ --excl-start GRCOV_EXCL_START \ From 1c864b56ede235928ab035de2daf758e31694476 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:17:14 +0000 Subject: [PATCH 069/120] Fix clippy lint --- src/lend/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index a6cffea3d..6f7bab5d7 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -7,7 +7,7 @@ use rustacuda_core::DeviceCopy; #[allow(clippy::module_name_repetitions)] pub use rust_cuda_derive::LendRustToCuda; -use crate::{alloc::CudaAlloc, utils::ffi::DeviceAccessible}; +use crate::alloc::CudaAlloc; #[cfg(feature = "device")] use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; @@ -16,9 +16,11 @@ use crate::{ alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; - #[cfg(any(feature = "host", feature = "device"))] -use crate::safety::{NoSafeAliasing, SafeDeviceCopy}; +use crate::{ + safety::{NoSafeAliasing, SafeDeviceCopy}, + utils::ffi::DeviceAccessible, +}; mod impls; From 
ce8b69a13cb5fab47f5e84299291f2f09d8eb707 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 26 Dec 2023 18:46:02 +0000 Subject: [PATCH 070/120] Far too much time spent getting rid of DeviceCopy --- Cargo.toml | 2 - examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 5 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 6 +- src/host/mod.rs | 304 ++++++---------------- src/kernel/mod.rs | 5 +- src/kernel/param.rs | 259 ++++++------------ src/lend/impls/box.rs | 24 +- src/lend/impls/boxed_slice.rs | 38 +-- src/lend/impls/option.rs | 10 +- src/lend/impls/ref.rs | 25 +- src/lend/impls/ref_mut.rs | 23 +- src/lend/impls/slice_ref.rs | 23 +- src/lend/impls/slice_ref_mut.rs | 23 +- src/lend/mod.rs | 26 +- src/safety/device_copy.rs | 29 --- src/safety/mod.rs | 6 +- src/safety/no_aliasing.rs | 86 ------ src/safety/portable.rs | 63 +++++ src/safety/stack_only.rs | 18 +- src/utils/aliasing/const.rs | 8 - src/utils/aliasing/dynamic.rs | 5 - src/utils/aliasing/final.rs | 90 ------- src/utils/aliasing/mod.rs | 3 - src/utils/device_copy.rs | 23 +- src/utils/exchange/buffer/common.rs | 26 +- src/utils/exchange/buffer/device.rs | 8 +- src/utils/exchange/buffer/host.rs | 58 +++-- src/utils/exchange/buffer/mod.rs | 50 ++-- src/utils/exchange/wrapper.rs | 78 ++++-- src/utils/ffi.rs | 150 ++++++++--- 31 files changed, 584 insertions(+), 892 deletions(-) delete mode 100644 src/safety/device_copy.rs delete mode 100644 src/safety/no_aliasing.rs create mode 100644 src/safety/portable.rs delete mode 100644 src/utils/aliasing/final.rs diff --git a/Cargo.toml b/Cargo.toml index 655359684..bbabb2007 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,5 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } -final = "0.1.1" - rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/examples/print/src/main.rs 
b/examples/print/src/main.rs index 31c6897f3..7423f06ac 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -12,7 +12,7 @@ extern crate alloc; -#[derive(rust_cuda::deps::const_type_layout::TypeLayout)] +#[derive(Copy, Clone, rust_cuda::deps::const_type_layout::TypeLayout)] #[layout(crate = "rust_cuda::deps::const_type_layout")] #[repr(C)] pub enum Action { diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 40d212294..13f2b7efe 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -41,7 +41,7 @@ pub struct Empty([u8; 0]); pub struct Tuple(u32, i32); #[repr(C)] -#[derive(rc::deps::const_type_layout::TypeLayout)] +#[derive(Copy, Clone, rc::deps::const_type_layout::TypeLayout)] #[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); @@ -58,8 +58,7 @@ pub fn kernel< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::alloc::EmptyCudaAlloc, > - + rc::safety::StackOnly - + rc::safety::NoSafeAliasing, + + rc::safety::StackOnly, >( _x: &rc::kernel::param::PerThreadShallowCopy, _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 5eee100c1..612d77c5a 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -12,7 +12,7 @@ pub fn cuda_struct_declaration( struct_fields_cuda: &syn::Fields, struct_semi_cuda: Option, ) -> TokenStream { - let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); + let (_impl_generics, _ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); let struct_repr = if struct_attrs_cuda .iter() @@ -41,10 +41,6 @@ pub fn cuda_struct_declaration( #(#struct_layout_attrs)* #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause - - // 
#[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics #crate_path::deps::rustacuda_core::DeviceCopy - for #struct_name_cuda #ty_generics #where_clause {} } } diff --git a/src/host/mod.rs b/src/host/mod.rs index ba37e32e2..e480de9f2 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -6,17 +6,22 @@ use std::{ use rustacuda::{ context::Context, - error::{CudaError, CudaResult}, + error::CudaError, event::Event, - memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, + memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; -use rustacuda_core::{DeviceCopy, DevicePointer}; use crate::{ - safety::SafeDeviceCopy, - utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, + safety::PortableBitSemantics, + utils::{ + device_copy::SafeDeviceCopyWrapper, + ffi::{ + DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, + DeviceOwnedRef, + }, + }, }; pub trait CudaDroppable: Sized { @@ -56,20 +61,29 @@ impl DerefMut for CudaDropWrapper { } } -macro_rules! impl_sealed_drop_collection { - ($type:ident) => { - impl CudaDroppable for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; +impl CudaDroppable for DeviceBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for DeviceBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } } -impl_sealed_drop_collection!(DeviceBuffer); -impl_sealed_drop_collection!(DeviceBox); -impl_sealed_drop_collection!(LockedBuffer); -impl_sealed_drop_collection!(LockedBox); +impl CudaDroppable for LockedBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} macro_rules! 
impl_sealed_drop_value { ($type:ident) => { @@ -86,188 +100,20 @@ impl_sealed_drop_value!(Stream); impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); -#[repr(transparent)] #[allow(clippy::module_name_repetitions)] -pub struct HostLockedBox(*mut T); - -impl HostLockedBox { - /// # Errors - /// Returns a [`CudaError`] iff an error occurs inside CUDA - pub fn new(value: T) -> CudaResult { - // Safety: uninitialised memory is immediately written to without reading it - let locked_ptr = unsafe { - let locked_ptr: *mut T = LockedBox::into_raw(LockedBox::uninitialized()?); - locked_ptr.write(value); - locked_ptr - }; - - Ok(Self(locked_ptr)) - } -} - -impl Deref for HostLockedBox { - type Target = T; - - fn deref(&self) -> &Self::Target { - unsafe { &*self.0 } - } -} - -impl DerefMut for HostLockedBox { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { &mut *self.0 } - } -} - -impl From> for HostLockedBox { - fn from(locked_box: LockedBox) -> Self { - Self(LockedBox::into_raw(locked_box)) - } -} - -impl From> for LockedBox { - fn from(host_locked_box: HostLockedBox) -> Self { - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - unsafe { Self::from_raw(host_locked_box.0) } - } -} - -impl Drop for HostLockedBox { - fn drop(&mut self) { - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let locked_box = unsafe { LockedBox::from_raw(self.0) }; - - core::mem::drop(CudaDropWrapper::from(locked_box)); - } -} - -#[repr(transparent)] -#[allow(clippy::module_name_repetitions)] -pub struct HostDeviceBox(DevicePointer); - -impl crate::alloc::CudaAlloc for HostDeviceBox {} -impl crate::alloc::sealed::alloc::Sealed for HostDeviceBox {} - -impl HostDeviceBox { - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `value` into `self` failed. 
- pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_from(&mut *device_box, value) - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `self` into `value` failed. - pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_to(&*device_box, value) - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `value` into `self` failed. - /// - /// # Safety - /// - /// To use the data inside the device box, either - /// - the passed-in [`Stream`] must be synchronised - /// - the kernel must be launched on the passed-in [`Stream`] - pub unsafe fn async_copy_from( - &mut self, - value: &HostLockedBox, - stream: &Stream, - ) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; - - unsafe { - rustacuda::memory::AsyncCopyDestination::async_copy_from( - &mut *device_box, - &*locked_box, - stream, - ) - } - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `self` into `value` failed. - /// - /// # Safety - /// - /// To use the data inside `value`, the passed-in [`Stream`] must be - /// synchronised. 
- pub unsafe fn async_copy_to( - &self, - value: &mut HostLockedBox, - stream: &Stream, - ) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let mut locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; - - unsafe { - rustacuda::memory::AsyncCopyDestination::async_copy_to( - &*device_box, - &mut *locked_box, - stream, - ) - } - } -} - -impl From> for HostDeviceBox { - fn from(device_box: DeviceBox) -> Self { - Self(DeviceBox::into_device(device_box)) - } -} - -impl From> for DeviceBox { - fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - unsafe { Self::from_device(host_device_box.0) } - } -} - -impl Drop for HostDeviceBox { - fn drop(&mut self) { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. 
this function completes the roundtrip - let device_box = unsafe { DeviceBox::from_device(self.0) }; - - core::mem::drop(CudaDropWrapper::from(device_box)); - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, } -impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a mut HostDeviceBox, host_ref: &'a mut T) -> Self { + pub unsafe fn new( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { Self { device_box, host_ref, @@ -286,7 +132,8 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { host_ref: &mut T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceMutRef { @@ -295,7 +142,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { }); // Copy back any changes made - device_box.copy_to(host_ref)?; + device_box.copy_to(SafeDeviceCopyWrapper::from_mut(host_ref))?; core::mem::drop(device_box); @@ -308,7 +155,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { 'a: 'b, { DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), reference: PhantomData, } } @@ -354,24 +201,27 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics> { + 
device_box: &'a DeviceBox>, host_ref: &'a T, } -impl<'a, T: DeviceCopy> Clone for HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRef<'a, T> { fn clone(&self) -> Self { *self } } -impl<'a, T: DeviceCopy> Copy for HostAndDeviceConstRef<'a, T> {} +impl<'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRef<'a, T> {} -impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { + pub const unsafe fn new( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { Self { device_box, host_ref, @@ -390,7 +240,8 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { host_ref: &T, inner: F, ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); + let device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceConstRef { @@ -408,8 +259,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { where 'a: 'b, { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + DeviceConstRef { - pointer: self.device_box.0.as_raw(), + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), reference: PhantomData, } } @@ -441,12 +294,12 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, } -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { /// # Errors 
/// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -455,7 +308,8 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { mut value: T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&value))?); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { @@ -467,7 +321,7 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { #[must_use] pub fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, reference: PhantomData::<&'a mut ()>, } @@ -489,18 +343,18 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut HostDeviceBox, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: &'stream Stream, ) -> Self { @@ -523,7 +377,7 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { 'a: 'b, { DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), reference: PhantomData, } } @@ -559,27 +413,27 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { 
} #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a DeviceBox>, host_ref: &'a T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: DeviceCopy> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { fn clone(&self) -> Self { *self } } -impl<'stream, 'a, T: DeviceCopy> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} +impl<'stream, 'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} -impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` #[must_use] pub const unsafe fn new( - device_box: &'a HostDeviceBox, + device_box: &'a DeviceBox>, host_ref: &'a T, stream: &'stream Stream, ) -> Self { @@ -601,8 +455,10 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { where 'a: 'b, { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + DeviceConstRef { - pointer: self.device_box.0.as_raw(), + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), reference: PhantomData, } } @@ -622,13 +478,13 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 
'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceOwnedAsync<'stream, 'a, T> { #[must_use] /// # Safety /// @@ -636,7 +492,7 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea /// constructed-with [`Stream`] pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, reference: PhantomData::<&'a mut ()>, } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 0f490c9b0..29b3795c0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -6,7 +6,6 @@ use std::{ ptr::NonNull, }; -use const_type_layout::TypeGraphLayout; #[cfg(feature = "host")] use rustacuda::{ error::{CudaError, CudaResult}, @@ -28,6 +27,8 @@ mod ptx_jit; #[cfg(feature = "host")] use ptx_jit::{PtxJITCompiler, PtxJITResult}; +use crate::safety::PortableBitSemantics; + pub mod param; mod sealed { @@ -41,7 +42,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + type FfiType<'stream, 'b>: PortableBitSemantics; #[cfg(any(feature = "device", doc))] type DeviceType<'b>; diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 2e4461051..9b2499b51 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -14,6 +14,7 @@ use crate::{ alloc::EmptyCudaAlloc, kernel::{sealed, CudaKernelParameter}, lend::RustToCuda, + safety::PortableBitSemantics, utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef}, }; @@ -36,20 +37,13 @@ impl DerefMut for PtxJit { } } -pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, -> { +pub struct PerThreadShallowCopy { never: !, _marker: PhantomData, } -impl< - T: crate::safety::SafeDeviceCopy - + 
crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > Deref for PerThreadShallowCopy +impl Deref + for PerThreadShallowCopy { type Target = T; @@ -58,22 +52,16 @@ impl< } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > DerefMut for PerThreadShallowCopy +impl DerefMut + for PerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { self.never } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for PerThreadShallowCopy +impl + CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; @@ -124,32 +112,19 @@ impl< inner(param) } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for PerThreadShallowCopy +impl sealed::Sealed + for PerThreadShallowCopy { } -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PerThreadShallowCopy +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; #[cfg(feature = "host")] type SyncHostType = &'a T; @@ -159,19 +134,7 @@ impl< _stream: &'stream rustacuda::stream::Stream, inner: impl 
for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - inner(const_ref.as_async()) + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| inner(const_ref.as_async())) } #[cfg(feature = "host")] @@ -199,27 +162,18 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - let param = param.as_ref().into_ref(); + let param = param.as_ref(); inner(param) } } -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PerThreadShallowCopy +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + sealed::Sealed for &'a PerThreadShallowCopy { } -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PtxJit> +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = @@ -272,21 +226,21 @@ impl< ) } } -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PtxJit> +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + sealed::Sealed for &'a PtxJit> { } -pub struct ShallowInteriorMutable { +pub struct ShallowInteriorMutable< + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + 
InteriorMutableSync, +> { never: !, _marker: PhantomData, } -impl Deref for ShallowInteriorMutable { +impl Deref + for ShallowInteriorMutable +{ type Target = T; fn deref(&self) -> &Self::Target { @@ -294,19 +248,19 @@ impl Deref for ShallowInteriorMutable { } } -impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter - for &'a ShallowInteriorMutable +impl< + 'a, + T: 'static + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + InteriorMutableSync, + > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; #[cfg(feature = "host")] /// The kernel takes a mutable borrow of the interior mutable data to ensure /// the interior mutability is limited to just this kernel invocation. 
@@ -318,25 +272,9 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - let result = inner(const_ref.as_async()); - - host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( - param, - ))?; - - result + crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { + inner(const_ref.as_ref().as_async()) + }) } #[cfg(feature = "host")] @@ -364,24 +302,23 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - let param = param.as_ref().into_ref(); + let param = param.as_ref(); inner(param) } } -impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} - -pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout - + sealed::Sealed +impl< + 'a, + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + > sealed::Sealed for &'a ShallowInteriorMutable { } +pub trait InteriorMutableSync: Sync + sealed::Sealed {} + macro_rules! 
impl_atomic_interior_mutable { ($atomic:ident($interior:ty)) => { - impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl InteriorMutableSync for core::sync::atomic::$atomic {} impl sealed::Sealed for core::sync::atomic::$atomic {} }; ($($atomic:ident($interior:ty)),*) => { @@ -395,27 +332,21 @@ impl_atomic_interior_mutable! { AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) } -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +impl InteriorMutableSync + for core::cell::SyncUnsafeCell { } -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for core::cell::SyncUnsafeCell +impl sealed::Sealed + for core::cell::SyncUnsafeCell { } -pub struct SharedHeapPerThreadShallowCopy { +pub struct SharedHeapPerThreadShallowCopy { never: !, _marker: PhantomData, } -impl Deref for SharedHeapPerThreadShallowCopy { +impl Deref for SharedHeapPerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { @@ -423,17 +354,11 @@ impl Deref for SharedHeapPerThrea } } -impl DerefMut for SharedHeapPerThreadShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - impl< T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: 'static + crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -488,16 +413,14 @@ impl< } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for 
SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, @@ -548,16 +471,13 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a SharedHeapPerThreadShallowCopy -{ -} +impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} impl< T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: 'static + crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -616,14 +536,14 @@ impl< } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for PtxJit> { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -681,10 +601,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara ) } } -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a PtxJit> -{ -} +impl<'a, T: RustToCuda> sealed::Sealed for &'a PtxJit> {} #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -703,34 +620,30 @@ fn emit_param_ptx_jit_marker(param: &T) { } mod private_shared { + use 
core::marker::PhantomData; + use const_type_layout::{TypeGraphLayout, TypeLayout}; - use rustacuda_core::DeviceCopy; + + use crate::safety::PortableBitSemantics; #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _marker: [T; 0], + pub struct ThreadBlockSharedFfi { + pub(super) _dummy: [u8; 0], + pub(super) _marker: PhantomData, } - // Safety: there is nothing to copy, this is just a zero-sized marker type - unsafe impl DeviceCopy for ThreadBlockSharedFfi {} - #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { + pub struct ThreadBlockSharedSliceFfi { pub(super) len: usize, pub(super) _marker: [T; 0], } - - // Safety: we only copy a usize, which implements `DeviceCopy` - unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} } -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::ThreadBlockShared -{ +impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; #[cfg(any(feature = "device", doc))] @@ -765,7 +678,10 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter fn async_to_ffi<'stream, 'b>( _param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { _marker: [] } + private_shared::ThreadBlockSharedFfi { + _dummy: [], + _marker: PhantomData::, + } } #[cfg(feature = "device")] @@ -780,12 +696,9 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(&mut param) } } -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::ThreadBlockShared -{ -} +impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter for &'a mut 
crate::utils::shared::ThreadBlockSharedSlice { #[cfg(feature = "host")] @@ -840,7 +753,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter } } } -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockSharedSlice { } diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 4acfd7b2c..e2a78999b 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -1,4 +1,4 @@ -use crate::deps::alloc::boxed::Box; +use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -7,7 +7,7 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,17 +24,11 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T); +pub struct BoxCudaRepresentation(DeviceOwnedPointer); -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box { +unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; @@ -52,9 +46,9 @@ unsafe impl RustToCuda for Box { CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); Ok(( - DeviceAccessible::from(BoxCudaRepresentation( + DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( device_box.as_device_ptr().as_raw_mut().cast(), - )), + ))), CombinedCudaAlloc::new(device_box, alloc), )) } @@ -76,11 +70,11 @@ 
unsafe impl RustToCuda for Box { } } -unsafe impl CudaAsRust for BoxCudaRepresentation { +unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - crate::deps::alloc::boxed::Box::from_raw(this.0) + crate::deps::alloc::boxed::Box::from_raw(this.0 .0) } } diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 6e1c95d90..677fcca7d 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -1,4 +1,6 @@ -use crate::deps::alloc::boxed::Box; +use core::marker::PhantomData; + +use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -7,7 +9,7 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,17 +24,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize); - -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl rustacuda_core::DeviceCopy - for BoxedSliceCudaRepresentation -{ +pub struct BoxedSliceCudaRepresentation { + data: DeviceOwnedPointer, + len: usize, + _marker: PhantomData, } -unsafe impl RustToCuda for Box<[T]> { +unsafe impl RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -53,10 +53,11 @@ unsafe impl RustToCuda for Box<[T]> { )?); Ok(( - DeviceAccessible::from(BoxedSliceCudaRepresentation( - device_buffer.as_mut_ptr().cast(), - device_buffer.len(), - )), + DeviceAccessible::from(BoxedSliceCudaRepresentation { + data: 
DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::, + }), CombinedCudaAlloc::new(device_buffer, alloc), )) } @@ -78,11 +79,16 @@ unsafe impl RustToCuda for Box<[T]> { } } -unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { +unsafe impl CudaAsRust + for BoxedSliceCudaRepresentation +{ type RustRepresentation = Box<[T]>; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( + this.data.0, + this.len, + )) } } diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 291a4a255..f12f24861 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -7,7 +7,7 @@ use rustacuda::error::CudaResult; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; @@ -23,10 +23,6 @@ pub struct OptionCudaRepresentation { present: bool, } -// Safety: Since the CUDA representation of T is DeviceCopy, -// the full enum is also DeviceCopy -unsafe impl rustacuda_core::DeviceCopy for OptionCudaRepresentation {} - unsafe impl RustToCuda for Option { type CudaAllocation = Option<::CudaAllocation>; type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; @@ -149,7 +145,7 @@ unsafe impl CudaAsRust for OptionCudaRepresentation { } } -impl RustToCudaProxy> +impl RustToCudaProxy> for Option> { fn from_ref(val: &Option) -> &Self { @@ -167,7 +163,7 @@ impl RustToCudaProxy> } } -impl RustToCudaAsyncProxy> +impl RustToCudaAsyncProxy> for Option> { fn from_ref(val: &Option) -> &Self { diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index c6aee84e6..39ba6117d 100644 --- a/src/lend/impls/ref.rs +++ 
b/src/lend/impls/ref.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,20 +25,14 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct RefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *const T, +pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, _marker: PhantomData<&'a T>, } -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for RefCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; @@ -56,7 +51,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { Ok(( DeviceAccessible::from(RefCudaRepresentation { - data: device_box.as_device_ptr().as_raw().cast(), + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), _marker: PhantomData::<&'a T>, }), CombinedCudaAlloc::new(device_box, alloc), @@ -73,11 +68,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefCudaRepresentation<'a, T> +{ type RustRepresentation = &'a T; #[cfg(feature = 
"device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - &*this.data + &*this.data.0 } } diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index a4f4dbe29..33d0fa6e7 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,20 +25,14 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct RefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *mut T, +pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, _marker: PhantomData<&'a mut T>, } -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for RefMutCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; @@ -56,7 +51,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { Ok(( DeviceAccessible::from(RefMutCudaRepresentation { - data: device_box.as_device_ptr().as_raw_mut().cast(), + data: DeviceMutPointer(device_box.as_device_ptr().as_raw_mut().cast()), _marker: PhantomData::<&'a mut T>, }), CombinedCudaAlloc::new(device_box, alloc), @@ -80,14 +75,14 @@ unsafe impl<'a, T: SafeDeviceCopy + 
TypeGraphLayout> RustToCuda for &'a mut T { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefMutCudaRepresentation<'a, T> { type RustRepresentation = &'a mut T; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - let data: *mut T = this.data; + let data: *mut T = this.data.0; &mut *data } } diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 6108f9ccd..4b7898571 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,21 +23,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct SliceRefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *const T, +pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, len: usize, _marker: PhantomData<&'a [T]>, } -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for SliceRefCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -58,7 +53,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { Ok(( DeviceAccessible::from(SliceRefCudaRepresentation { - data: 
device_buffer.as_ptr().cast(), + data: DeviceConstPointer(device_buffer.as_ptr().cast()), len: device_buffer.len(), _marker: PhantomData::<&'a [T]>, }), @@ -76,13 +71,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefCudaRepresentation<'a, T> { type RustRepresentation = &'a [T]; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - core::slice::from_raw_parts(this.data, this.len) + core::slice::from_raw_parts(this.data.0, this.len) } } diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index b2f79abf9..9246fa474 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,21 +23,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct SliceRefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *mut T, +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, len: usize, _marker: PhantomData<&'a mut [T]>, } -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for SliceRefMutCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { #[cfg(all(feature = "host", not(doc)))] type 
CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -58,7 +53,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] Ok(( DeviceAccessible::from(SliceRefMutCudaRepresentation { - data: device_buffer.as_mut_ptr().cast(), + data: DeviceMutPointer(device_buffer.as_mut_ptr().cast()), len: device_buffer.len(), _marker: PhantomData::<&'a mut [T]>, }), @@ -83,13 +78,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefMutCudaRepresentation<'a, T> { type RustRepresentation = &'a mut [T]; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - core::slice::from_raw_parts_mut(this.data, this.len) + core::slice::from_raw_parts_mut(this.data.0, this.len) } } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 6f7bab5d7..2fac0a08e 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -1,7 +1,6 @@ use const_type_layout::TypeGraphLayout; #[cfg(feature = "host")] use rustacuda::error::CudaError; -use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] #[allow(clippy::module_name_repetitions)] @@ -17,10 +16,7 @@ use crate::{ host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::{ - safety::{NoSafeAliasing, SafeDeviceCopy}, - utils::ffi::DeviceAccessible, -}; +use crate::{safety::PortableBitSemantics, utils::ffi::DeviceAccessible}; mod impls; @@ -120,7 +116,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// # Safety /// /// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { +pub unsafe trait CudaAsRust: PortableBitSemantics + TypeGraphLayout { type RustRepresentation: RustToCuda; #[doc(hidden)] @@ -147,7 +143,7 @@ pub trait 
RustToCudaAsyncProxy: RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -pub trait LendToCuda: RustToCuda + NoSafeAliasing { +pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure @@ -167,7 +163,7 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] + /// Moves `self` to CUDA iff `self` has [`PortableBitSemantics`] /// /// # Errors /// @@ -183,11 +179,11 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result where - Self: RustToCuda; + Self: RustToCuda; } #[cfg(feature = "host")] -impl LendToCuda for T { +impl LendToCuda for T { fn lend_to_cuda< O, E: From, @@ -219,7 +215,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -232,7 +228,7 @@ impl LendToCuda for T { } #[cfg(feature = "device")] -pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { +pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the @@ -254,11 +250,11 @@ pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { ) -> O where Self: Sized, - ::CudaRepresentation: SafeDeviceCopy; + ::CudaRepresentation: PortableBitSemantics; } #[cfg(feature = "device")] -impl BorrowFromRust for T { +impl BorrowFromRust for T { #[inline] unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, @@ -278,7 +274,7 @@ impl BorrowFromRust for T { ) -> O where Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, + ::CudaRepresentation: PortableBitSemantics, { inner(CudaAsRust::as_rust(cuda_repr.as_mut())) } diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs deleted file mode 100644 index a2bfc9552..000000000 --- 
a/src/safety/device_copy.rs +++ /dev/null @@ -1,29 +0,0 @@ -use const_type_layout::TypeGraphLayout; - -use crate::{safety::StackOnly, utils::ffi::DeviceAccessible}; - -#[allow(clippy::module_name_repetitions)] -/// Types which are safe to memcpy from the CPU to a GPU. -/// -/// For a type to implement [`SafeDeviceCopy`], it must -/// -/// * have the same memory layout on both the CPU and GPU -/// -/// * not contain any references to data that is inaccessible from the GPU -/// -/// Types that implement both [`TypeGraphLayout`] and [`StackOnly`] satisfy -/// both of these criteria and thus implement [`SafeDeviceCopy`]. -#[marker] -pub trait SafeDeviceCopy: sealed::Sealed {} - -impl SafeDeviceCopy for T {} -impl sealed::Sealed for T {} - -#[doc(hidden)] -impl SafeDeviceCopy for DeviceAccessible {} -impl sealed::Sealed for DeviceAccessible {} - -mod sealed { - #[marker] - pub trait Sealed {} -} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 72ed9c7db..243a2a9f9 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,6 +1,5 @@ mod arch; -mod device_copy; -mod no_aliasing; +mod portable; mod stack_only; #[doc(hidden)] @@ -8,6 +7,5 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; -pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoSafeAliasing; +pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs deleted file mode 100644 index 7baa06f19..000000000 --- a/src/safety/no_aliasing.rs +++ /dev/null @@ -1,86 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -/// Types which can be safely shared between CUDA threads because they do -/// not provide safe aliasing mutable access to some shared inner state. -/// -/// This trait is automatically implemented when the compiler determines -/// it's appropriate. -/// -/// Data types that contain no references and can thus live entirely on -/// the stack, e.g. 
primitive types like [`u8`] and structs, tuples, and -/// enums made only from them, or more generally those types that implement -/// [`StackOnly`](super::StackOnly), also implement [`NoSafeAliasing`] as they -/// do not contain any inner data that might be shared when each thread is -/// given mutable access to a copy. -/// -/// In contrast, `&mut T` (and any type containing a mutable reference) do *not* -/// implement [`NoSafeAliasing`] as several threads would obtain mutable -/// aliasing access to the same date, thus violating Rust's borrowing and -/// memory safety rules. -/// -/// Even though `*const T` and `*mut T` do not provide *safe* mutable aliasing -/// access to their underlying data, as dereferincing them is always unsafe, -/// they (and any type containing a pointer) do *not* implement -/// [`NoSafeAliasing`] to ensure that any data type that uses them to build a -/// safe interface to accessing data, e.g. [`Box`], does not accidentially -/// implement [`NoSafeAliasing`]. If you have implemented a data structure that -/// uses `*const T` or `*mut T` internally but also ensures that no safe -/// aliasing mutable access is provided, you can *unsafely* implement -/// [`NoSafeAliasing`] for your type. Please reference the [Safety](#safety) -/// section below for more details on the contract you must uphold in this case. -/// -/// # Safety -/// -/// This trait must only be manually implemented for a type that upholds -/// the no-mutable-aliasing guarantee through its safe API. -/// -/// The following examples outline three different cases for types that do -/// fulfil this safety requirement: -/// -/// * [`Final`](final::Final) implements [`NoSafeAliasing`] -/// because even a mutable reference to it only provides read-only access -/// to its inner data. 
-/// -/// * [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) -/// and -/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) -/// also implement [`NoSafeAliasing`] because they only provide each CUDA thread -/// with mutable access to its own partition of a slice and thus avoid mutable -/// aliasing. -/// -/// * [`ThreadBlockShared`](crate::utils::shared::ThreadBlockShared) -/// and -/// [`ThreadBlockSharedSlice`](crate::utils::shared::ThreadBlockSharedSlice) -/// also implement [`NoSafeAliasing`] since they only provide access to `*mut -/// T`, which is always unsafe to mutate and thus moves the burden to uphoald -/// the no-mutable-aliasing safety invariant to the user who derefereces these -/// pointers. -pub unsafe auto trait NoSafeAliasing {} - -impl !NoSafeAliasing for &mut T {} -impl !NoSafeAliasing for *const T {} -impl !NoSafeAliasing for *mut T {} - -unsafe impl NoSafeAliasing for core::marker::PhantomData {} - -unsafe impl NoSafeAliasing for r#final::Final {} -unsafe impl NoSafeAliasing - for crate::utils::aliasing::FinalCudaRepresentation -{ -} - -unsafe impl NoSafeAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride -{ -} -unsafe impl NoSafeAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride -{ -} - -// Thread-block-shared data only allows unsafe aliasing since only raw pointers -// are exposed -unsafe impl NoSafeAliasing for crate::utils::shared::ThreadBlockShared {} -unsafe impl NoSafeAliasing - for crate::utils::shared::ThreadBlockSharedSlice -{ -} diff --git a/src/safety/portable.rs b/src/safety/portable.rs new file mode 100644 index 000000000..5b438e2f7 --- /dev/null +++ b/src/safety/portable.rs @@ -0,0 +1,63 @@ +macro_rules! 
portable_bit_semantics_docs { + ($item:item) => { + /// Types whose in-memory bit representation on the CPU host is safe to copy + /// to and read back on the GPU device while maintaining the same semantics, + /// iff the type layout on the CPU matches the type layout on the GPU. + /// + /// For a type to implement [`PortableBitSemantics`], it + /// + /// * should have the same memory layout on both the CPU and GPU, and + /// + /// * must not contain any references to data that are exposed as safely + /// accessible on both ends but actually inaccessible on one. + /// + /// For instance, a reference `&u8` to host memory has the same well-defined + /// layout on both CPU and GPU (if their pointer sizes and alignments + /// match), but it is not portable since the host memory is generally + /// not accessible from the GPU. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Trait bounds usually combine [`PortableBitSemantics`] with + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) to check that + /// the type layout is indeed the same on both the host CPU and the GPU + /// device. + /// + /// Types that implement [`StackOnly`](crate::safety::StackOnly) and + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) satisfy both + /// of the above criteria and thus also implement [`PortableBitSemantics`]. + $item + }; +} + +#[cfg(not(doc))] +portable_bit_semantics_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait PortableBitSemantics: sealed::PortableBitSemantics {} +} +#[cfg(doc)] +portable_bit_semantics_docs! 
{ + pub use sealed::PortableBitSemantics; +} + +#[cfg(not(doc))] +impl PortableBitSemantics for T {} + +mod sealed { + pub auto trait PortableBitSemantics {} + + impl !PortableBitSemantics for &T {} + impl !PortableBitSemantics for &mut T {} + impl !PortableBitSemantics for *const T {} + impl !PortableBitSemantics for *mut T {} + + impl PortableBitSemantics for core::marker::PhantomData {} + + impl PortableBitSemantics for crate::utils::ffi::DeviceConstPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceMutPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceOwnedPointer {} +} diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index bfb4e80d0..eac7f9456 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -64,23 +64,23 @@ macro_rules! stack_only_docs { #[cfg(not(doc))] stack_only_docs! { #[allow(clippy::module_name_repetitions)] - pub trait StackOnly: sealed::Sealed {} + pub trait StackOnly: sealed::StackOnly {} } #[cfg(doc)] stack_only_docs! 
{ - pub use sealed::Sealed as StackOnly; + pub use sealed::StackOnly; } #[cfg(not(doc))] -impl StackOnly for T {} +impl StackOnly for T {} mod sealed { - pub auto trait Sealed {} + pub auto trait StackOnly {} - impl !Sealed for &T {} - impl !Sealed for &mut T {} - impl !Sealed for *const T {} - impl !Sealed for *mut T {} + impl !StackOnly for &T {} + impl !StackOnly for &mut T {} + impl !StackOnly for *const T {} + impl !StackOnly for *mut T {} - impl Sealed for core::marker::PhantomData {} + impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index b3a28cf25..c36f814bf 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -6,7 +6,6 @@ use core::{ }; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, @@ -25,13 +24,6 @@ impl SplitSliceOverCudaThreadsConstStride { } } -// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is -// [`DeviceCopy`] -unsafe impl DeviceCopy - for SplitSliceOverCudaThreadsConstStride -{ -} - #[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 50f028ec3..0ab97016c 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -6,7 +6,6 @@ use core::{ }; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, @@ -28,10 +27,6 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is -// [`DeviceCopy`] -unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} - #[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { let offset: usize = 
crate::device::thread::Thread::this().index() * stride; diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs deleted file mode 100644 index 432910920..000000000 --- a/src/utils/aliasing/final.rs +++ /dev/null @@ -1,90 +0,0 @@ -use const_type_layout::TypeLayout; -use r#final::Final; - -use crate::{ - lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - utils::ffi::DeviceAccessible, -}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct FinalCudaRepresentation(DeviceAccessible); - -// Safety: If [`T`] is [`CudaAsRust`], then the newtype struct is [`DeviceCopy`] -unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} - -unsafe impl RustToCuda for Final { - type CudaAllocation = T::CudaAllocation; - type CudaRepresentation = FinalCudaRepresentation; - - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::alloc::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow(alloc)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - unsafe fn restore( - &mut self, - alloc: crate::alloc::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore(alloc) - } -} - -unsafe impl RustToCudaAsync for Final { - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::alloc::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - unsafe fn 
restore_async( - &mut self, - alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore_async(alloc, stream) - } -} - -unsafe impl CudaAsRust for FinalCudaRepresentation { - type RustRepresentation = Final; - - #[cfg(feature = "device")] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - Final::new(CudaAsRust::as_rust(&this.0)) - } -} diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index de7c58e05..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,8 +1,5 @@ mod r#const; mod dynamic; -mod r#final; pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; pub use r#const::SplitSliceOverCudaThreadsConstStride; - -pub(crate) use self::r#final::FinalCudaRepresentation; diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 2363b4855..72bd7d64e 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -5,7 +5,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ alloc::NoCudaAlloc, lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -16,22 +16,17 @@ use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct SafeDeviceCopyWrapper(T); -unsafe impl rustacuda_core::DeviceCopy - for SafeDeviceCopyWrapper -{ -} +unsafe impl rustacuda_core::DeviceCopy for SafeDeviceCopyWrapper {} -impl From for SafeDeviceCopyWrapper { +impl From for SafeDeviceCopyWrapper { fn from(value: T) -> Self { Self(value) } } -impl SafeDeviceCopyWrapper { +impl SafeDeviceCopyWrapper { #[must_use] pub fn into_inner(self) -> T { 
self.0 @@ -86,7 +81,7 @@ impl SafeDeviceCopyWrapper { } } -unsafe impl RustToCuda for SafeDeviceCopyWrapper { +unsafe impl RustToCuda for SafeDeviceCopyWrapper { type CudaAllocation = NoCudaAlloc; type CudaRepresentation = Self; @@ -114,7 +109,9 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } -unsafe impl RustToCudaAsync for SafeDeviceCopyWrapper { +unsafe impl RustToCudaAsync + for SafeDeviceCopyWrapper +{ #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -141,7 +138,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC } } -unsafe impl CudaAsRust for SafeDeviceCopyWrapper { +unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; #[cfg(feature = "device")] diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 450ed0975..cfacf61a2 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,7 +1,6 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; -use rustacuda_core::DeviceCopy; -use crate::{lend::CudaAsRust, safety::SafeDeviceCopy}; +use crate::{lend::CudaAsRust, safety::PortableBitSemantics, utils::ffi::DeviceMutPointer}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -9,21 +8,16 @@ use super::{CudaExchangeBuffer, CudaExchangeItem}; #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] -pub struct CudaExchangeBufferCudaRepresentation( - pub(super) *mut CudaExchangeItem, +pub struct CudaExchangeBufferCudaRepresentation< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>( + pub(super) DeviceMutPointer>, pub(super) usize, -) -where - T: SafeDeviceCopy + TypeGraphLayout; +); -// Safety: [`CudaExchangeBufferCudaRepresentation`] is [`DeviceCopy`] -// iff [`T`] is [`SafeDeviceCopy`] -unsafe impl DeviceCopy - for CudaExchangeBufferCudaRepresentation -{ -} - -unsafe impl CudaAsRust +unsafe impl CudaAsRust for CudaExchangeBufferCudaRepresentation { type RustRepresentation = CudaExchangeBuffer; @@ 
-35,7 +29,7 @@ unsafe impl(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -23,7 +23,7 @@ impl Dere } } -impl DerefMut +impl DerefMut for CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 58e200881..f7fedc804 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,23 +12,29 @@ use rustacuda::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, - safety::SafeDeviceCopy, - utils::ffi::DeviceAccessible, + safety::PortableBitSemantics, + utils::{ + device_copy::SafeDeviceCopyWrapper, + ffi::{DeviceAccessible, DeviceMutPointer}, + }, }; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferHost< - T: SafeDeviceCopy + TypeGraphLayout, + T: PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { - host_buffer: CudaDropWrapper>>, - device_buffer: UnsafeCell>>>, + host_buffer: + CudaDropWrapper>>>, + device_buffer: UnsafeCell< + CudaDropWrapper>>>, + >, } -impl +impl CudaExchangeBufferHost { /// # Errors @@ -38,7 +44,10 @@ impl = unsafe { &*(elem as *const T).cast() }; - let host_buffer = CudaDropWrapper::from(LockedBuffer::new(elem, capacity)?); + let host_buffer = CudaDropWrapper::from(LockedBuffer::new( + SafeDeviceCopyWrapper::from_ref(elem), + capacity, + )?); let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -50,21 +59,26 @@ impl +impl CudaExchangeBufferHost { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn from_vec(vec: Vec) -> CudaResult { - let mut host_buffer_uninit = - CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? 
}); + let host_buffer = unsafe { + let mut uninit: CudaDropWrapper>> = + CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); - for (src, dst) in vec.into_iter().zip(host_buffer_uninit.iter_mut()) { - *dst = CudaExchangeItem(src); - } + for (i, src) in vec.into_iter().enumerate() { + uninit + .as_mut_ptr() + .add(i) + .write(SafeDeviceCopyWrapper::from(CudaExchangeItem(src))); + } - let host_buffer = host_buffer_uninit; + uninit + }; let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), @@ -77,25 +91,25 @@ impl } } -impl Deref +impl Deref for CudaExchangeBufferHost { type Target = [CudaExchangeItem]; fn deref(&self) -> &Self::Target { - self.host_buffer.as_slice() + SafeDeviceCopyWrapper::into_slice(self.host_buffer.as_slice()) } } -impl DerefMut +impl DerefMut for CudaExchangeBufferHost { fn deref_mut(&mut self) -> &mut Self::Target { - self.host_buffer.as_mut_slice() + SafeDeviceCopyWrapper::into_mut_slice(self.host_buffer.as_mut_slice()) } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -121,7 +135,7 @@ impl Ok(( DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( - device_buffer.as_mut_ptr(), + DeviceMutPointer(device_buffer.as_mut_ptr().cast()), device_buffer.len(), )), CombinedCudaAlloc::new(NoCudaAlloc, alloc), @@ -148,7 +162,7 @@ impl } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -176,7 +190,7 @@ impl Ok(( DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( - device_buffer.as_mut_ptr(), + DeviceMutPointer(device_buffer.as_mut_ptr().cast()), device_buffer.len(), )), CombinedCudaAlloc::new(NoCudaAlloc, alloc), diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c1dea16d0..f493f316c 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -9,7 +9,7 @@ use const_type_layout::TypeLayout; #[cfg(any(feature = "host", feature = "device"))] use 
const_type_layout::TypeGraphLayout; -use crate::safety::SafeDeviceCopy; +use crate::safety::PortableBitSemantics; #[cfg(any(feature = "host", feature = "device"))] use crate::{ @@ -35,8 +35,11 @@ mod host; #[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] -pub struct CudaExchangeBuffer -{ +pub struct CudaExchangeBuffer< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +> { #[cfg(feature = "host")] inner: host::CudaExchangeBufferHost, #[cfg(all(feature = "device", not(feature = "host")))] @@ -44,7 +47,7 @@ pub struct CudaExchangeBuffer +impl CudaExchangeBuffer { /// # Errors @@ -58,7 +61,7 @@ impl +impl CudaExchangeBuffer { /// # Errors @@ -72,7 +75,7 @@ impl } #[cfg(any(feature = "host", feature = "device"))] -impl Deref +impl Deref for CudaExchangeBuffer { type Target = [CudaExchangeItem]; @@ -83,7 +86,7 @@ impl Dere } #[cfg(any(feature = "host", feature = "device"))] -impl DerefMut +impl DerefMut for CudaExchangeBuffer { fn deref_mut(&mut self) -> &mut Self::Target { @@ -92,7 +95,7 @@ impl Dere } #[cfg(any(feature = "host", feature = "device"))] -unsafe impl RustToCuda +unsafe impl RustToCuda for CudaExchangeBuffer { type CudaAllocation = NoCudaAlloc; @@ -121,8 +124,8 @@ unsafe impl RustToCudaAsync - for CudaExchangeBuffer +unsafe impl + RustToCudaAsync for CudaExchangeBuffer { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -150,16 +153,13 @@ unsafe impl(T); - -// Safety: Transparent newtype wrapper around [`SafeDeviceCopy`] -// is [`DeviceCopy`] -unsafe impl rustacuda_core::DeviceCopy - for CudaExchangeItem -{ -} +pub struct CudaExchangeItem< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>(T); -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 @@ -171,7 +171,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub 
const fn read(&self) -> &T { &self.0 @@ -183,13 +183,13 @@ impl CudaExchangeItem { } } -impl AsMut for CudaExchangeItem { +impl AsMut for CudaExchangeItem { fn as_mut(&mut self) -> &mut T { &mut self.0 } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -201,7 +201,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -213,7 +213,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: @@ -231,7 +231,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 2e9decc51..9eedb058e 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -9,7 +9,7 @@ use std::{ use rustacuda::{ error::{CudaError, CudaResult}, event::{Event, EventFlags, EventStatus}, - memory::DeviceBox, + memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, stream::{Stream, StreamWaitEventFlags}, }; @@ -17,25 +17,33 @@ use crate::{ alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, - HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, + HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::ffi::DeviceAccessible, + utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + 
locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, move_event: CudaDropWrapper, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, waker: Arc>>, @@ -44,8 +52,12 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -53,8 +65,12 @@ pub struct ExchangeWrapperOnDevice #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, @@ -67,12 +83,20 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn new(value: T) -> CudaResult { // Safety: The uninitialised memory is never exposed - // To access the device memory, [`Self::move_to_device`] has to be - // called first, which initialised the memory. 
- let device_box = unsafe { DeviceBox::uninitialized() }?.into(); + // To access the device memory, [`Self::move_to_device`] has to + // be called first, which initialised the memory. + let device_box = CudaDropWrapper::from(unsafe { DeviceBox::uninitialized() }?); let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; - let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; + let locked_cuda_repr = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + SafeDeviceCopyWrapper::CudaRepresentation>>, + >::uninitialized()?); + uninit + .as_mut_ptr() + .write(SafeDeviceCopyWrapper::from(cuda_repr)); + uninit + }; let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); @@ -98,9 +122,9 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; - *self.locked_cuda_repr = cuda_repr; + **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); - self.device_box.copy_from(&self.locked_cuda_repr)?; + self.device_box.copy_from(&**self.locked_cuda_repr)?; Ok(ExchangeWrapperOnDevice { value: self.value, @@ -128,14 +152,14 @@ impl> ExchangeWrapperOnHost CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; - *self.locked_cuda_repr = cuda_repr; + **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); // Safety: The device value is not safely exposed until either // - the passed-in [`Stream`] is synchronised // - the kernel is launched on the passed-in [`Stream`] unsafe { self.device_box - .async_copy_from(&self.locked_cuda_repr, stream) + .async_copy_from(&*self.locked_cuda_repr, stream) }?; self.move_event.record(stream)?; @@ -316,7 +340,11 @@ impl<'stream, T: RustToCuda> ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRefAsync::new(&self.device_box, 
&self.locked_cuda_repr, self.stream) + HostAndDeviceConstRefAsync::new( + &*self.device_box, + (**self.locked_cuda_repr).into_ref(), + self.stream, + ) } } @@ -327,7 +355,7 @@ impl<'stream, T: RustToCuda> unsafe { HostAndDeviceMutRefAsync::new( &mut self.device_box, - &mut self.locked_cuda_repr, + (**self.locked_cuda_repr).into_mut(), self.stream, ) } @@ -470,14 +498,18 @@ impl> ExchangeWrapperOnDevice { &self, ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.locked_cuda_repr) } + unsafe { + HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + } } pub fn as_mut( &mut self, ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.locked_cuda_repr) } + unsafe { + HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) + } } } diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs index 98fd945e7..3b205ffca 100644 --- a/src/utils/ffi.rs +++ b/src/utils/ffi.rs @@ -7,20 +7,16 @@ use core::{ #[cfg(feature = "host")] use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; -#[cfg(feature = "host")] -use const_type_layout::TypeGraphLayout; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; +use crate::safety::PortableBitSemantics; #[cfg(feature = "host")] -use crate::{lend::CudaAsRust, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; +use crate::{lend::CudaAsRust, utils::device_copy::SafeDeviceCopyWrapper}; -#[repr(transparent)] #[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} +#[repr(transparent)] +pub struct DeviceAccessible(T); #[cfg(feature = "host")] impl From for DeviceAccessible { @@ 
-29,8 +25,9 @@ impl From for DeviceAccessible { } } +// TODO: should there be some copy bound here? #[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { +impl From<&T> for DeviceAccessible> { fn from(value: &T) -> Self { let value = unsafe { let mut uninit = MaybeUninit::uninit(); @@ -43,7 +40,7 @@ impl From<&T> for DeviceAccessible fmt::Debug for DeviceAccessible { +impl fmt::Debug for DeviceAccessible { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) .finish_non_exhaustive() @@ -51,7 +48,7 @@ impl fmt::Debug for DeviceAccessible { } #[cfg(feature = "device")] -impl Deref for DeviceAccessible { +impl Deref for DeviceAccessible { type Target = T; fn deref(&self) -> &Self::Target { @@ -60,74 +57,155 @@ impl Deref for DeviceAccessible { } #[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { +impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } +#[derive(TypeLayout)] #[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { +pub struct DeviceConstRef<'r, T: PortableBitSemantics + 'r> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *const T, + pub(crate) pointer: DeviceConstPointer, pub(crate) reference: PhantomData<&'r T>, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} +impl<'r, T: PortableBitSemantics> Copy for DeviceConstRef<'r, T> {} + +impl<'r, T: PortableBitSemantics> Clone for DeviceConstRef<'r, T> { + fn clone(&self) -> Self { + *self + } +} #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceConstRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } -#[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { +#[repr(transparent)] +pub struct DeviceMutRef<'r, T: 
PortableBitSemantics + 'r> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *mut T, + pub(crate) pointer: DeviceMutPointer, pub(crate) reference: PhantomData<&'r mut T>, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceMutRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsMut for DeviceMutRef<'r, T> { fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } + unsafe { &mut *self.pointer.0 } } } -#[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceOwnedRef<'r, T: DeviceCopy> { +#[repr(transparent)] +pub struct DeviceOwnedRef<'r, T: PortableBitSemantics> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *mut T, + pub(crate) pointer: DeviceOwnedPointer, pub(crate) reference: PhantomData<&'r mut ()>, pub(crate) marker: PhantomData, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} - #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } + unsafe { &mut *self.pointer.0 } + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceConstPointer(pub(crate) *const T); + +impl Copy for DeviceConstPointer {} + +impl Clone for DeviceConstPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceConstPointer<[T]> { + #[must_use] + pub fn 
into_raw_parts(self) -> (DeviceConstPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceConstPointer(data.cast()), len) + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceMutPointer(pub(crate) *mut T); + +impl Copy for DeviceMutPointer {} + +impl Clone for DeviceMutPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceMutPointer { + #[must_use] + pub const fn as_const(self) -> DeviceConstPointer { + DeviceConstPointer(self.0.cast_const()) + } +} + +impl DeviceMutPointer<[T]> { + #[must_use] + pub fn into_raw_parts(self) -> (DeviceMutPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceMutPointer(data.cast()), len) + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceOwnedPointer(pub(crate) *mut T); + +impl Copy for DeviceOwnedPointer {} + +impl Clone for DeviceOwnedPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceOwnedPointer { + #[must_use] + pub const fn as_const(self) -> DeviceConstPointer { + DeviceConstPointer(self.0.cast_const()) + } + + #[must_use] + pub const fn as_mut(self) -> DeviceMutPointer { + DeviceMutPointer(self.0) + } +} + +impl DeviceOwnedPointer<[T]> { + #[must_use] + pub fn into_raw_parts(self) -> (DeviceOwnedPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceOwnedPointer(data.cast()), len) } } From d88bac07ba57af93058524d6fd9636b8962ee7b1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 27 Dec 2023 10:24:35 +0000 Subject: [PATCH 071/120] More refactoring and auditing kernel param bounds --- examples/single-source/src/main.rs | 9 +- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 2 +- src/host/mod.rs | 83 +++--- src/kernel/mod.rs | 4 +- src/kernel/param.rs | 151 +++++++---- src/lend/impls/box.rs | 11 +- src/lend/impls/boxed_slice.rs | 9 +- src/lend/impls/option.rs | 28 +- src/lend/impls/ref.rs | 9 +- src/lend/impls/ref_mut.rs | 11 +- src/lend/impls/slice_ref.rs | 7 +- src/lend/impls/slice_ref_mut.rs | 9 +- 
src/lend/mod.rs | 23 +- src/utils/adapter.rs | 248 ++++++++++++++++++ src/utils/device_copy.rs | 150 ----------- src/utils/exchange/buffer/common.rs | 12 +- src/utils/exchange/buffer/device.rs | 13 +- src/utils/exchange/buffer/host.rs | 48 ++-- src/utils/exchange/buffer/mod.rs | 56 ++-- src/utils/exchange/wrapper.rs | 60 ++++- src/utils/ffi.rs | 29 +- src/utils/mod.rs | 2 +- 22 files changed, 605 insertions(+), 369 deletions(-) create mode 100644 src/utils/adapter.rs delete mode 100644 src/utils/device_copy.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 13f2b7efe..4783deffa 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -23,7 +23,7 @@ fn main() {} #[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); -#[derive(rc::lend::LendRustToCuda)] +#[derive(Clone, rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { @@ -31,7 +31,7 @@ pub struct Wrapper { inner: T, } -#[derive(rc::lend::LendRustToCuda)] +#[derive(Clone, rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] pub struct Empty([u8; 0]); @@ -54,6 +54,9 @@ pub struct Triple(i32, i32, i32); pub fn kernel< 'a, T: 'static + + Send + + Sync + + Clone + rc::lend::RustToCuda< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::alloc::EmptyCudaAlloc, @@ -96,7 +99,7 @@ mod host { // Link several instances of the generic CUDA kernel struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } - crate::link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } + crate::link! 
{ impl kernel<'a, rc::utils::adapter::RustToCudaWithPortableBitCopySemantics> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 313daf86b..36924aaf9 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -109,7 +109,7 @@ pub fn swap_field_type_and_filter_attrs( } else { field_ty = parse_quote! { #crate_path::utils::ffi::DeviceAccessible< - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::utils::adapter::RustToCudaWithPortableBitCopySemantics<#field_ty> > }; diff --git a/src/host/mod.rs b/src/host/mod.rs index e480de9f2..f77c75792 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -4,6 +4,7 @@ use std::{ ops::{Deref, DerefMut}, }; +use const_type_layout::TypeGraphLayout; use rustacuda::{ context::Context, error::CudaError, @@ -16,7 +17,7 @@ use rustacuda::{ use crate::{ safety::PortableBitSemantics, utils::{ - device_copy::SafeDeviceCopyWrapper, + adapter::DeviceCopyWithPortableBitSemantics, ffi::{ DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, DeviceOwnedRef, @@ -101,17 +102,17 @@ impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, } -impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut DeviceBox>, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, ) -> Self { Self { @@ -132,8 +133,9 @@ impl<'a, T: PortableBitSemantics> 
HostAndDeviceMutRef<'a, T> { host_ref: &mut T, inner: F, ) -> Result { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceMutRef { @@ -142,7 +144,7 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { }); // Copy back any changes made - device_box.copy_to(SafeDeviceCopyWrapper::from_mut(host_ref))?; + device_box.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(host_ref))?; core::mem::drop(device_box); @@ -201,25 +203,25 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics> { - device_box: &'a DeviceBox>, +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, host_ref: &'a T, } -impl<'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'a, T> { fn clone(&self) -> Self { *self } } -impl<'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRef<'a, T> {} +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} -impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub const unsafe fn new( - device_box: &'a DeviceBox>, + device_box: &'a DeviceBox>, host_ref: &'a T, ) -> Self { Self { @@ -240,8 +242,9 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { host_ref: &T, inner: F, ) -> Result { - let device_box = - 
CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); + let device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceConstRef { @@ -294,12 +297,12 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, } -impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -308,8 +311,9 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { mut value: T, inner: F, ) -> Result { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&value))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&value), + )?); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { @@ -343,18 +347,20 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceMutRefAsync<'stream, 'a, T> +{ /// # Safety /// /// 
`device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut DeviceBox>, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: &'stream Stream, ) -> Self { @@ -413,27 +419,34 @@ impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a DeviceBox>, +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, host_ref: &'a T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Clone + for HostAndDeviceConstRefAsync<'stream, 'a, T> +{ fn clone(&self) -> Self { *self } } -impl<'stream, 'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Copy + for HostAndDeviceConstRefAsync<'stream, 'a, T> +{ +} -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceConstRefAsync<'stream, 'a, T> +{ /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` #[must_use] pub const unsafe fn new( - device_box: &'a DeviceBox>, + device_box: &'a DeviceBox>, host_ref: &'a T, stream: &'stream Stream, ) -> Self { @@ -478,13 +491,15 @@ impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, ' } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, stream: PhantomData<&'stream 
Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceOwnedAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceOwnedAsync<'stream, 'a, T> +{ #[must_use] /// # Safety /// diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 29b3795c0..b6ed5b8e7 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -226,9 +226,9 @@ impl RawPtxKernel { /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does /// not contain an entry point named `entry_point`. pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); + let module: Box = Box::new(Module::load_from_string(ptx)?); - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point); let function = match function { Ok(function) => function, diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 9b2499b51..17d4bc3a5 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -37,12 +37,14 @@ impl DerefMut for PtxJit { } } -pub struct PerThreadShallowCopy { +pub struct PerThreadShallowCopy< + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, +> { never: !, _marker: PhantomData, } -impl Deref +impl Deref for PerThreadShallowCopy { type Target = T; @@ -52,7 +54,7 @@ impl Deref } } -impl DerefMut +impl DerefMut for PerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { @@ -60,14 +62,20 @@ impl DerefMut } } -impl - CudaKernelParameter for PerThreadShallowCopy +impl< + T: Copy + + Send + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type AsyncHostType<'stream, 'b> = + 
crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; #[cfg(feature = "host")] type SyncHostType = T; @@ -77,9 +85,7 @@ impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( - param, - )) + inner(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "host")] @@ -112,13 +118,24 @@ impl sealed::Sealed - for PerThreadShallowCopy +impl< + T: Copy + + Send + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy { } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - CudaKernelParameter for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; @@ -167,13 +184,25 @@ impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableB inner(param) } } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - sealed::Sealed for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy { } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - CudaKernelParameter for &'a PtxJit> +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > 
CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = @@ -226,20 +255,35 @@ impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableB ) } } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - sealed::Sealed for &'a PtxJit> +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> { } pub struct ShallowInteriorMutable< - T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, > { never: !, _marker: PhantomData, } -impl Deref - for ShallowInteriorMutable +impl< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > Deref for ShallowInteriorMutable { type Target = T; @@ -251,8 +295,10 @@ impl CudaKernelParameter for &'a ShallowInteriorMutable { @@ -309,7 +355,11 @@ impl< } impl< 'a, - T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + T: crate::safety::StackOnly + + Sync + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, > sealed::Sealed for &'a ShallowInteriorMutable { } @@ -355,10 +405,12 @@ impl Deref for SharedHeapPerThreadShallowCopy { } impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -412,15 +464,19 @@ impl< } } impl< - T: RustToCuda< - CudaRepresentation: crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + 
RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, @@ -471,13 +527,15 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThrea unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -535,15 +593,17 @@ impl< } } impl< - T: RustToCuda< - CudaRepresentation: crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for PtxJit> { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -601,7 +661,10 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter ) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a PtxJit> {} +impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed + for &'a PtxJit> +{ +} #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 
e2a78999b..4156b1a29 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -17,7 +17,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -28,7 +28,7 @@ pub struct BoxCudaRepresentation(Devi unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; @@ -42,8 +42,9 @@ unsafe impl RustToCuda for Box { DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( @@ -62,7 +63,7 @@ unsafe impl RustToCuda for Box { let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 677fcca7d..575ea4ef6 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -19,7 +19,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -34,7 +34,8 @@ pub struct BoxedSliceCudaRepresentation RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation 
= + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; @@ -49,7 +50,7 @@ unsafe impl RustToCuda for Box<[T]> { CombinedCudaAlloc, )> { let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( @@ -71,7 +72,7 @@ unsafe impl RustToCuda for Box<[T]> { let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index f12f24861..fab89b89d 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -8,7 +8,7 @@ use rustacuda::error::CudaResult; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, safety::PortableBitSemantics, - utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, + utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible}, }; #[cfg(feature = "host")] @@ -145,38 +145,36 @@ unsafe impl CudaAsRust for OptionCudaRepresentation { } } -impl RustToCudaProxy> - for Option> +impl RustToCudaProxy> + for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &*(val as *const Option).cast() } + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &*core::ptr::from_ref(val).cast() } } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &mut *(val as *mut Option).cast() } + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &mut *core::ptr::from_mut(val).cast() } } fn into(self) -> Option { 
- self.map(SafeDeviceCopyWrapper::into_inner) + self.map(RustToCudaWithPortableBitCopySemantics::into_inner) } } -impl RustToCudaAsyncProxy> - for Option> +impl RustToCudaAsyncProxy> + for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &*(val as *const Option).cast() } + >>::from_ref(val) } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &mut *(val as *mut Option).cast() } + >>::from_mut(val) } fn into(self) -> Option { - self.map(SafeDeviceCopyWrapper::into_inner) + >>::into(self) } } diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 39ba6117d..c068920ab 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -32,7 +32,7 @@ pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLay unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; @@ -46,8 +46,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(RefCudaRepresentation { diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index 
33d0fa6e7..2a59d8953 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -32,7 +32,7 @@ pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraph unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; @@ -46,8 +46,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(RefMutCudaRepresentation { @@ -67,7 +68,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 4b7898571..70d3a1e63 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -33,7 +33,8 
@@ pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGra unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; @@ -48,7 +49,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T CombinedCudaAlloc, )> { let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index 9246fa474..0e802ccca 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -33,7 +33,8 @@ pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + Type unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; @@ -48,7 +49,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu CombinedCudaAlloc, )> { let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + 
DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( @@ -70,7 +71,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; core::mem::drop(alloc_front); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 2fac0a08e..603064fb8 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -6,17 +6,18 @@ use rustacuda::error::CudaError; #[allow(clippy::module_name_repetitions)] pub use rust_cuda_derive::LendRustToCuda; -use crate::alloc::CudaAlloc; - +#[cfg(any(feature = "host", feature = "device", doc))] +use crate::safety::StackOnly; #[cfg(feature = "device")] use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; +#[cfg(any(feature = "host", feature = "device"))] +use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; #[cfg(feature = "host")] use crate::{ - alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; -#[cfg(any(feature = "host", feature = "device"))] -use crate::{safety::PortableBitSemantics, utils::ffi::DeviceAccessible}; mod impls; @@ -163,7 +164,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` has [`PortableBitSemantics`] + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// /// # Errors /// @@ -179,7 +180,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: RustToCuda; + Self: RustToCuda; } #[cfg(feature = "host")] @@ -215,7 +216,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -249,8 +250,7 @@ pub trait BorrowFromRust: RustToCuda { inner: F, ) -> O where - Self: Sized, - ::CudaRepresentation: PortableBitSemantics; + Self: Sized + RustToCuda; } #[cfg(feature = "device")] @@ -273,8 +273,7 @@ impl BorrowFromRust for T { inner: F, ) -> O where - Self: Sized, - ::CudaRepresentation: PortableBitSemantics, + Self: RustToCuda, { inner(CudaAsRust::as_rust(cuda_repr.as_mut())) } diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs new file mode 100644 index 000000000..8be7712ef --- /dev/null +++ b/src/utils/adapter.rs @@ -0,0 +1,248 @@ +#![allow(clippy::trait_duplication_in_bounds)] + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +use crate::{ + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCopySemantics( + T, +); + +impl From + for RustToCudaWithPortableBitCopySemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl RustToCudaWithPortableBitCopySemantics { + #[must_use] + pub const fn from_copy(value: &T) -> Self { + Self(*value) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub 
const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = 
CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCopySemantics +{ + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCopySemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct DeviceCopyWithPortableBitSemantics(T); + +unsafe impl rustacuda_core::DeviceCopy + for DeviceCopyWithPortableBitSemantics +{ +} + +impl From for DeviceCopyWithPortableBitSemantics { + fn from(value: T) -> Self { + Self(value) + } +} + +impl DeviceCopyWithPortableBitSemantics { + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { 
&*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs deleted file mode 100644 index 72bd7d64e..000000000 --- a/src/utils/device_copy.rs +++ /dev/null @@ -1,150 +0,0 @@ -#![allow(clippy::trait_duplication_in_bounds)] - -use const_type_layout::{TypeGraphLayout, TypeLayout}; - -use crate::{ - alloc::NoCudaAlloc, - 
lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - safety::PortableBitSemantics, -}; - -#[cfg(any(feature = "host", feature = "device"))] -use crate::utils::ffi::DeviceAccessible; - -#[cfg(feature = "host")] -use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; - -#[derive(Copy, Clone, Debug, TypeLayout)] -#[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T); - -unsafe impl rustacuda_core::DeviceCopy for SafeDeviceCopyWrapper {} - -impl From for SafeDeviceCopyWrapper { - fn from(value: T) -> Self { - Self(value) - } -} - -impl SafeDeviceCopyWrapper { - #[must_use] - pub fn into_inner(self) -> T { - self.0 - } - - #[must_use] - pub const fn from_ref(reference: &T) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &*(reference as *const T).cast() } - } - - #[must_use] - pub const fn into_ref(&self) -> &T { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &*(self as *const Self).cast() } - } - - #[must_use] - pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &mut *(reference as *mut T).cast() } - } - - #[must_use] - pub fn into_mut(&mut self) -> &mut T { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &mut *(self as *mut Self).cast() } - } - - #[must_use] - pub const fn from_slice(slice: &[T]) -> &[Self] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - #[must_use] - pub const fn into_slice(slice: &[Self]) -> &[T] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - #[must_use] - pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { 
core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } - - #[must_use] - pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } -} - -unsafe impl RustToCuda for SafeDeviceCopyWrapper { - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = Self; - - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(&self.0), alloc)) - } - - #[cfg(feature = "host")] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - - Ok(alloc_tail) - } -} - -unsafe impl RustToCudaAsync - for SafeDeviceCopyWrapper -{ - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(&self.0), alloc)) - } - - #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for SafeDeviceCopyWrapper { - type RustRepresentation = Self; - - #[cfg(feature = "device")] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - let mut uninit = core::mem::MaybeUninit::uninit(); - core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); - uninit.assume_init() - } -} diff --git 
a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index cfacf61a2..079dba419 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,6 +1,10 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; -use crate::{lend::CudaAsRust, safety::PortableBitSemantics, utils::ffi::DeviceMutPointer}; +use crate::{ + lend::CudaAsRust, + safety::{PortableBitSemantics, StackOnly}, + utils::ffi::DeviceMutPointer, +}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -9,7 +13,7 @@ use super::{CudaExchangeBuffer, CudaExchangeItem}; #[derive(TypeLayout)] #[repr(C)] pub struct CudaExchangeBufferCudaRepresentation< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, >( @@ -17,8 +21,8 @@ pub struct CudaExchangeBufferCudaRepresentation< pub(super) usize, ); -unsafe impl CudaAsRust - for CudaExchangeBufferCudaRepresentation +unsafe impl + CudaAsRust for CudaExchangeBufferCudaRepresentation { type RustRepresentation = CudaExchangeBuffer; diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 8c4b3b6ee..5083263b3 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,18 +2,21 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{deps::alloc::boxed::Box, safety::PortableBitSemantics}; +use crate::{ + deps::alloc::boxed::Box, + safety::{PortableBitSemantics, StackOnly}, +}; use super::CudaExchangeItem; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferDevice< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, >(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -23,8 +26,8 @@ impl DerefMut - for CudaExchangeBufferDevice +impl + DerefMut for 
CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index f7fedc804..e62227d8e 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,9 +12,9 @@ use rustacuda::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, - safety::PortableBitSemantics, + safety::{PortableBitSemantics, StackOnly}, utils::{ - device_copy::SafeDeviceCopyWrapper, + adapter::DeviceCopyWithPortableBitSemantics, ffi::{DeviceAccessible, DeviceMutPointer}, }, }; @@ -23,29 +23,35 @@ use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferHost< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { - host_buffer: - CudaDropWrapper>>>, + host_buffer: CudaDropWrapper< + LockedBuffer>>, + >, device_buffer: UnsafeCell< - CudaDropWrapper>>>, + CudaDropWrapper< + DeviceBuffer>>, + >, >, } -impl - CudaExchangeBufferHost +impl< + T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > CudaExchangeBufferHost { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn new(elem: &T, capacity: usize) -> CudaResult { // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T - let elem: &CudaExchangeItem = unsafe { &*(elem as *const T).cast() }; + let elem: &CudaExchangeItem = unsafe { &*std::ptr::from_ref(elem).cast() }; let host_buffer = CudaDropWrapper::from(LockedBuffer::new( - SafeDeviceCopyWrapper::from_ref(elem), + DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( @@ -59,7 +65,7 @@ impl +impl CudaExchangeBufferHost { /// # Errors @@ -67,14 
+73,16 @@ impl) -> CudaResult { let host_buffer = unsafe { - let mut uninit: CudaDropWrapper>> = + let mut uninit: CudaDropWrapper>> = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); for (i, src) in vec.into_iter().enumerate() { uninit .as_mut_ptr() .add(i) - .write(SafeDeviceCopyWrapper::from(CudaExchangeItem(src))); + .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem( + src, + ))); } uninit @@ -91,25 +99,25 @@ impl Deref +impl Deref for CudaExchangeBufferHost { type Target = [CudaExchangeItem]; fn deref(&self) -> &Self::Target { - SafeDeviceCopyWrapper::into_slice(self.host_buffer.as_slice()) + DeviceCopyWithPortableBitSemantics::into_slice(self.host_buffer.as_slice()) } } -impl DerefMut - for CudaExchangeBufferHost +impl + DerefMut for CudaExchangeBufferHost { fn deref_mut(&mut self) -> &mut Self::Target { - SafeDeviceCopyWrapper::into_mut_slice(self.host_buffer.as_mut_slice()) + DeviceCopyWithPortableBitSemantics::into_mut_slice(self.host_buffer.as_mut_slice()) } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -162,7 +170,7 @@ impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index f493f316c..31c76f1b7 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -6,10 +6,9 @@ use core::{ use const_type_layout::TypeLayout; -#[cfg(any(feature = "host", feature = "device"))] use const_type_layout::TypeGraphLayout; -use crate::safety::PortableBitSemantics; +use crate::safety::{PortableBitSemantics, StackOnly}; #[cfg(any(feature = "host", feature = "device"))] use crate::{ @@ -36,7 +35,7 @@ mod host; #[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBuffer< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { @@ -47,8 +46,11 @@ pub 
struct CudaExchangeBuffer< } #[cfg(feature = "host")] -impl - CudaExchangeBuffer +impl< + T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > CudaExchangeBuffer { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -61,7 +63,7 @@ impl +impl CudaExchangeBuffer { /// # Errors @@ -75,7 +77,7 @@ impl Deref +impl Deref for CudaExchangeBuffer { type Target = [CudaExchangeItem]; @@ -86,8 +88,8 @@ impl DerefMut - for CudaExchangeBuffer +impl + DerefMut for CudaExchangeBuffer { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner @@ -95,8 +97,8 @@ impl RustToCuda - for CudaExchangeBuffer +unsafe impl + RustToCuda for CudaExchangeBuffer { type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; @@ -124,7 +126,7 @@ unsafe impl +unsafe impl RustToCudaAsync for CudaExchangeBuffer { #[cfg(feature = "host")] @@ -154,12 +156,14 @@ unsafe impl(T); -impl CudaExchangeItem { +impl + CudaExchangeItem +{ #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 @@ -171,7 +175,9 @@ impl CudaExchangeIte } } -impl CudaExchangeItem { +impl + CudaExchangeItem +{ #[cfg(feature = "device")] pub const fn read(&self) -> &T { &self.0 @@ -183,13 +189,15 @@ impl CudaExchangeIte } } -impl AsMut for CudaExchangeItem { +impl AsMut + for CudaExchangeItem +{ fn as_mut(&mut self) -> &mut T { &mut self.0 } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -201,7 +209,7 @@ impl CudaExchangeItem } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -213,13 +221,13 @@ impl CudaExchangeItem } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent 
newtype - unsafe { &*(self as *const Self).cast() } + unsafe { &*core::ptr::from_ref(self).cast() } } #[cfg(feature = "host")] @@ -227,17 +235,17 @@ impl CudaExchangeItem // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &mut *(self as *mut Self).cast() } + unsafe { &mut *core::ptr::from_mut(self).cast() } } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &*(self as *const Self).cast() } + unsafe { &*core::ptr::from_ref(self).cast() } } #[cfg(feature = "device")] @@ -245,6 +253,6 @@ impl CudaExchangeItem // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &mut *(self as *mut Self).cast() } + unsafe { &mut *core::ptr::from_mut(self).cast() } } } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 9eedb058e..b7bbeba09 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -20,17 +20,25 @@ use crate::{ HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, + utils::{adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible}, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, move_event: CudaDropWrapper, } @@ -39,10 +47,18 @@ pub struct ExchangeWrapperOnHost> pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { 
value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, @@ -53,10 +69,18 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, @@ -66,10 +90,18 @@ pub struct ExchangeWrapperOnDevice pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, @@ -90,11 +122,13 @@ impl> ExchangeWrapperOnHost { let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; let locked_cuda_repr = unsafe { let mut uninit = CudaDropWrapper::from(LockedBox::< - SafeDeviceCopyWrapper::CudaRepresentation>>, + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, >::uninitialized()?); uninit .as_mut_ptr() - .write(SafeDeviceCopyWrapper::from(cuda_repr)); + 
.write(DeviceCopyWithPortableBitSemantics::from(cuda_repr)); uninit }; @@ -122,7 +156,7 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; - **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); self.device_box.copy_from(&**self.locked_cuda_repr)?; @@ -152,7 +186,7 @@ impl> ExchangeWrapperOnHost CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; - **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); // Safety: The device value is not safely exposed until either // - the passed-in [`Stream`] is synchronised diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs index 3b205ffca..52d7f691d 100644 --- a/src/utils/ffi.rs +++ b/src/utils/ffi.rs @@ -5,18 +5,18 @@ use core::{ ops::{Deref, DerefMut}, }; #[cfg(feature = "host")] -use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; +use std::fmt; -use const_type_layout::TypeLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::safety::PortableBitSemantics; #[cfg(feature = "host")] -use crate::{lend::CudaAsRust, utils::device_copy::SafeDeviceCopyWrapper}; +use crate::{lend::CudaAsRust, utils::adapter::RustToCudaWithPortableBitCopySemantics}; #[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] #[repr(transparent)] -pub struct DeviceAccessible(T); +pub struct DeviceAccessible(T); #[cfg(feature = "host")] impl From for DeviceAccessible { @@ -25,22 +25,19 @@ impl From for DeviceAccessible { } } -// TODO: should there be some copy bound here? 
#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { +impl From<&T> + for DeviceAccessible> +{ fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) + Self(RustToCudaWithPortableBitCopySemantics::from_copy(value)) } } #[cfg(all(feature = "host", not(doc)))] -impl fmt::Debug for DeviceAccessible { +impl fmt::Debug + for DeviceAccessible +{ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) .finish_non_exhaustive() @@ -48,7 +45,7 @@ impl fmt::Debug for DeviceAccessi } #[cfg(feature = "device")] -impl Deref for DeviceAccessible { +impl Deref for DeviceAccessible { type Target = T; fn deref(&self) -> &Self::Target { @@ -57,7 +54,7 @@ impl Deref for DeviceAccessible { } #[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { +impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 65a4379fb..bab467e42 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,5 +1,5 @@ +pub mod adapter; pub mod aliasing; -pub mod device_copy; pub mod exchange; pub mod ffi; pub mod shared; From 76af5f11712529b35af113c2676071d471af04a3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 28 Dec 2023 12:45:33 +0000 Subject: [PATCH 072/120] First exploration towards a stricter async CUDA API --- .../src/rust_to_cuda/field_copy.rs | 17 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 7 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 32 +-- src/alloc.rs | 17 +- src/lend/impls/box.rs | 106 +++++++++- src/lend/impls/option.rs | 6 +- src/lend/mod.rs | 135 ++++++++++++- src/utils/adapter.rs | 2 + src/utils/aliasing/const.rs | 6 +- src/utils/aliasing/dynamic.rs | 6 +- src/utils/async.rs | 186 ++++++++++++++++++ src/utils/exchange/buffer/mod.rs | 2 + 
src/utils/exchange/wrapper.rs | 45 +++-- src/utils/mod.rs | 1 + 14 files changed, 519 insertions(+), 49 deletions(-) create mode 100644 src/utils/async.rs diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 10f528730..1baf8829e 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -12,6 +12,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( cuda_repr_field_ty: &CudaReprFieldTy, mut combined_cuda_alloc_type: TokenStream, + mut combined_cuda_alloc_async_type: TokenStream, r2c_field_declarations: &mut Vec, r2c_field_async_declarations: &mut Vec, @@ -20,7 +21,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_async_destructors: &mut Vec, c2r_field_initialisations: &mut Vec, -) -> TokenStream { +) -> (TokenStream, TokenStream) { #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, @@ -63,6 +64,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( @@ -109,6 +116,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! 
{ let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( @@ -160,5 +173,5 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, } - combined_cuda_alloc_type + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 612d77c5a..d1249720e 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -125,6 +125,7 @@ pub fn rust_to_cuda_async_trait( struct_name_cuda: &syn::Ident, struct_generics_cuda_async: &syn::Generics, struct_fields_cuda: &syn::Fields, + combined_cuda_alloc_async_type: &TokenStream, r2c_field_async_declarations: &[TokenStream], r2c_field_initialisations: &[TokenStream], r2c_field_async_destructors: &[TokenStream], @@ -149,6 +150,8 @@ pub fn rust_to_cuda_async_trait( unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics #where_clause { + type CudaAllocationAsync = #combined_cuda_alloc_async_type; + #[cfg(not(target_os = "cuda"))] unsafe fn borrow_async( &self, @@ -156,7 +159,7 @@ pub fn rust_to_cuda_async_trait( stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::ffi::DeviceAccessible, - #crate_path::alloc::CombinedCudaAlloc + #crate_path::alloc::CombinedCudaAlloc )> { let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; @@ -175,7 +178,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn restore_async( &mut self, alloc: #crate_path::alloc::CombinedCudaAlloc< - Self::CudaAllocation, CudaAllocType + Self::CudaAllocationAsync, CudaAllocType >, stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult { diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 6a885ac94..77382d4c4 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ 
b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -33,6 +33,9 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let mut combined_cuda_alloc_type: TokenStream = quote! { #crate_path::alloc::NoCudaAlloc }; + let mut combined_cuda_alloc_async_type: TokenStream = quote! { + #crate_path::alloc::NoCudaAlloc + }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); @@ -57,19 +60,21 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(&crate_path, field); - combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type( - &crate_path, - field, - field_index, - &cuda_repr_field_ty, - combined_cuda_alloc_type, - &mut r2c_field_declarations, - &mut r2c_field_async_declarations, - &mut r2c_field_initialisations, - &mut r2c_field_destructors_reverse, - &mut r2c_field_async_destructors_reverse, - &mut c2r_field_initialisations, - ); + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) = + field_copy::impl_field_copy_init_and_expand_alloc_type( + &crate_path, + field, + field_index, + &cuda_repr_field_ty, + combined_cuda_alloc_type, + combined_cuda_alloc_async_type, + &mut r2c_field_declarations, + &mut r2c_field_async_declarations, + &mut r2c_field_initialisations, + &mut r2c_field_destructors_reverse, + &mut r2c_field_async_destructors_reverse, + &mut c2r_field_initialisations, + ); } // The fields must be deallocated in the reverse order of their allocation @@ -110,6 +115,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &struct_name_cuda, &struct_generics_cuda_async, &struct_fields_cuda, + &combined_cuda_alloc_async_type, &r2c_field_async_declarations, &r2c_field_initialisations, &r2c_field_async_destructors, diff --git a/src/alloc.rs b/src/alloc.rs index f16178aec..80d0ee840 100644 --- 
a/src/alloc.rs +++ b/src/alloc.rs @@ -1,6 +1,6 @@ #![allow(clippy::module_name_repetitions)] -pub trait EmptyCudaAlloc: sealed::empty::Sealed {} +pub trait EmptyCudaAlloc: From + Into + sealed::empty::Sealed {} pub trait CudaAlloc: sealed::alloc::Sealed {} @@ -30,6 +30,21 @@ impl sealed::empty for CombinedCudaAlloc { } +impl From + for CombinedCudaAlloc +{ + fn from(_: NoCudaAlloc) -> Self { + Self(A::from(NoCudaAlloc), B::from(NoCudaAlloc)) + } +} +impl From> + for NoCudaAlloc +{ + fn from(val: CombinedCudaAlloc) -> Self { + let _: (Self, Self) = (val.0.into(), val.1.into()); + Self + } +} impl CombinedCudaAlloc { #[must_use] pub const fn new(front: A, tail: B) -> Self { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 4156b1a29..1ec853b34 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -1,13 +1,16 @@ -use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + deps::alloc::boxed::Box, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, + utils::ffi::DeviceOwnedPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -71,6 +74,103 @@ unsafe impl RustToCuda for Box { } } +unsafe impl RustToCudaAsync for Box { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + use 
rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, stream)?; + + Ok(( + DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( + device_box.as_device_ptr().as_raw_mut().cast(), + ))), + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + use rustacuda::memory::AsyncCopyDestination; + + struct PromiseSend(T); + #[allow(clippy::non_send_fields_in_send_ty)] + unsafe impl Send for PromiseSend {} + + let (alloc_front, alloc_tail) = alloc.split(); + let (mut locked_box, device_box) = alloc_front.split(); + + device_box.async_copy_to(&mut *locked_box, stream)?; + + { + // TODO: express this unsafe-rich completion safely + // by explicitly capturing &mut self until the + // async restore has completed + let self_ptr: *mut T = std::ptr::from_mut(self); + + let self_ptr = PromiseSend(self_ptr); + let locked_box = PromiseSend(locked_box); + let device_box = PromiseSend(device_box); + + stream.add_callback(Box::new(move |res| { + let self_ptr: PromiseSend<_> = self_ptr; + + std::mem::drop(device_box); + if res == Ok(()) { + // Safety: The precondition of this method guarantees that + // &mut self has been borrowed until after this + // completion is run + unsafe { + std::ptr::copy_nonoverlapping( + locked_box.0.as_ptr().cast::(), + self_ptr.0, + 1, + ); + } + } + std::mem::drop(locked_box); + })) + }?; + + Ok(alloc_tail) + } +} + unsafe 
impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index fab89b89d..a0f0f8b4a 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -79,6 +79,8 @@ unsafe impl RustToCuda for Option { } unsafe impl RustToCudaAsync for Option { + type CudaAllocationAsync = Option<::CudaAllocationAsync>; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -87,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { stream: &rustacuda::stream::Stream, ) -> CudaResult<( DeviceAccessible, - CombinedCudaAlloc, + CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { None => ( @@ -118,7 +120,7 @@ unsafe impl RustToCudaAsync for Option { #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> CudaResult { let (alloc_front, alloc_tail) = alloc.split(); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 603064fb8..50f2aee1d 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,6 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, + utils::r#async::{Async, CudaAsync}, }; mod impls; @@ -72,6 +73,8 @@ pub unsafe trait RustToCuda { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCudaAsync: RustToCuda { + type CudaAllocationAsync: CudaAlloc; + #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -81,11 +84,19 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// # Safety /// - /// This is an internal function and should NEVER be called manually + /// This is an internal function and should NEVER be called manually. 
+ /// /// The returned /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER /// be accessed on the CPU as it contains a GPU-resident copy of /// `self`. + /// + /// Since this method may perform asynchronous computation but returns its + /// result immediately, this result must only be used to construct compound + /// asynchronous computations before it has been synchronized on. + /// + /// Similarly, `&self` should remain borrowed until synchronisation has + /// been performed. #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -93,7 +104,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[doc(hidden)] @@ -105,11 +116,17 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// # Safety /// - /// This is an internal function and should NEVER be called manually + /// This is an internal function and should NEVER be called manually. + /// + /// Since this method may perform asynchronous computation but returns + /// immediately, `&mut self` not be used until it has been synchronized on. + /// + /// Therefore, `&mut self` should remain mutably borrowed until + /// synchronisation has been performed. 
#[allow(clippy::type_complexity)] unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult; } @@ -228,6 +245,116 @@ impl LendToCuda for T { } } +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCudaAsync: RustToCudaAsync { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + >, + ) -> Result, + >( + &self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result; + + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + >, + ) -> Result, + >( + self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result + where + Self: RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCudaAsync for T { + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + >, + ) -> Result, + >( + &self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { + inner(Async::new(const_ref, stream)?) 
+ }); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + >, + ) -> Result, + >( + self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result + where + Self: RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { + inner(Async::new(owned_ref, stream)?) + }); + + core::mem::drop(alloc); + + result + } +} + #[cfg(feature = "device")] pub trait BorrowFromRust: RustToCuda { /// # Safety diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 8be7712ef..8e27d98df 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -129,6 +129,8 @@ unsafe impl RustToCuda unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { + type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index c36f814bf..8441a5bd1 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -215,6 +215,8 @@ unsafe impl RustToCuda unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsConstStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -223,7 +225,7 @@ unsafe impl RustToCudaAsync stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::alloc::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -236,7 +238,7 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: crate::alloc::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> 
rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 0ab97016c..f8a04fa06 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -193,6 +193,8 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -201,7 +203,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::alloc::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; @@ -217,7 +219,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: crate::alloc::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git a/src/utils/async.rs b/src/utils/async.rs new file mode 100644 index 000000000..b691b755f --- /dev/null +++ b/src/utils/async.rs @@ -0,0 +1,186 @@ +#[cfg(feature = "host")] +use std::{ + future::Future, future::IntoFuture, future::Ready, marker::PhantomData, sync::Arc, sync::Mutex, + task::Poll, task::Waker, +}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::CudaError, error::CudaResult, event::Event, event::EventFlags, event::EventStatus, + stream::Stream, stream::StreamWaitEventFlags, +}; + +#[cfg(feature = "host")] +use crate::host::CudaDropWrapper; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { + /// Wraps a still-asynchronous `value` which is being computed on `stream` + /// such that its 
computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn new(value: T, stream: &'stream Stream) -> CudaResult; + + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn synchronize(self) -> CudaResult; + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn move_to_stream<'stream_new>( + self, + stream: &'stream_new Stream, + ) -> CudaResult>; +} + +#[cfg(feature = "host")] +pub struct Sync { + value: T, +} + +#[cfg(feature = "host")] +impl<'stream, T> CudaAsync<'stream, T> for Sync { + fn new(value: T, _stream: &'stream Stream) -> CudaResult { + Ok(Self { value }) + } + + fn synchronize(self) -> CudaResult { + Ok(self.value) + } + + #[allow(refining_impl_trait)] + fn move_to_stream(self, _stream: &Stream) -> CudaResult { + Ok(self) + } +} + +#[cfg(feature = "host")] +impl IntoFuture for Sync { + type IntoFuture = Ready>; + type Output = CudaResult; + + fn into_future(self) -> Self::IntoFuture { + std::future::ready(Ok(self.value)) + } +} + +#[cfg(feature = "host")] +pub struct Async<'stream, T> { + _stream: PhantomData<&'stream Stream>, + event: CudaDropWrapper, + waker: Arc>>, + value: T, +} + +#[cfg(feature = "host")] +impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { + fn new(value: T, stream: &'stream Stream) -> CudaResult { + let event = CudaDropWrapper::from(Event::new( + EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, + )?); + event.record(stream)?; + + let waker: Arc>> = Arc::new(Mutex::new(None)); + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut waker) 
= waker_callback.lock() { + if let Some(waker) = waker.take() { + waker.wake(); + } + } + }))?; + + Ok(Self { + _stream: PhantomData::<&'stream Stream>, + event, + waker, + value, + }) + } + + fn synchronize(self) -> CudaResult { + self.event.synchronize()?; + + Ok(self.value) + } + + #[allow(refining_impl_trait)] + fn move_to_stream<'stream_new>( + self, + stream: &'stream_new Stream, + ) -> CudaResult> { + stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; + self.event.record(stream)?; + + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut waker) = waker_callback.lock() { + if let Some(waker) = waker.take() { + waker.wake(); + } + } + }))?; + + Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + event: self.event, + waker: self.waker, + value: self.value, + }) + } +} + +#[cfg(feature = "host")] +impl<'stream, T> Async<'stream, T> { + /// # Safety + /// + /// The returned inner value of type `T` may not yet have completed its + /// asynchronous work and may thus be in an inconsistent state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_unchecked(self) -> T { + self.value + } +} + +#[cfg(feature = "host")] +impl<'stream, T> IntoFuture for Async<'stream, T> { + type Output = CudaResult; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + std::future::poll_fn(move |cx| match &wrapper { + Some(Async { waker, event, .. }) => match event.query() { + Ok(EventStatus::NotReady) => waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut waker| { + *waker = Some(cx.waker().clone()); + Poll::Pending + }, + ), + Ok(EventStatus::Ready) => match wrapper.take() { + Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }) + } +} diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 31c76f1b7..9dfc4414e 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -129,6 +129,8 @@ unsafe impl RustToCudaAsync for CudaExchangeBuffer { + type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index b7bbeba09..454ecc8f3 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -14,7 +14,7 @@ use rustacuda::{ }; use crate::{ - alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + alloc::{EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, @@ -82,7 +82,6 @@ pub struct ExchangeWrapperOnDevice >, >, >, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -103,7 +102,6 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda, >, >, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, waker: Arc>>, @@ -160,17 +158,20 @@ impl> ExchangeWrapperOnHost { self.device_box.copy_from(&**self.locked_cuda_repr)?; + let _: NoCudaAlloc = null_alloc.into(); + Ok(ExchangeWrapperOnDevice { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc, move_event: self.move_event, }) } } -impl> ExchangeWrapperOnHost { +impl> + ExchangeWrapperOnHost +{ /// Moves the data asynchronously to the CUDA device. 
/// /// To avoid aliasing, each CUDA thread will get access to its own shallow @@ -208,11 +209,12 @@ impl> ExchangeWrapperOnHost> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc: self.null_alloc, move_event: self.move_event, }) } @@ -362,7 +363,6 @@ impl<'stream, T: RustToCuda> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc: self.null_alloc, move_event: self.move_event, stream, waker: self.waker, @@ -406,8 +406,10 @@ impl<'stream, T: RustToCuda> /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -420,8 +422,10 @@ impl<'stream, T: RustToCuda> } } -impl<'stream, T: RustToCudaAsync> - ExchangeWrapperOnDeviceAsync<'stream, T> +impl< + 'stream, + T: RustToCudaAsync, + > ExchangeWrapperOnDeviceAsync<'stream, T> { /// Moves the data asynchronously back to the host CPU device. 
/// @@ -437,9 +441,10 @@ impl<'stream, T: RustToCudaAsync> mut self, stream: &'stream Stream, ) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = - unsafe { self.value.restore_async(self.null_alloc, stream) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU @@ -491,7 +496,6 @@ impl<'stream, T: RustToCuda> IntoFuture value: inner.value, device_box: inner.device_box, locked_cuda_repr: inner.locked_cuda_repr, - null_alloc: inner.null_alloc, move_event: inner.move_event, })), None => Poll::Ready(Err(CudaError::AlreadyAcquired)), @@ -515,8 +519,10 @@ impl> ExchangeWrapperOnDevice { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -547,7 +553,9 @@ impl> ExchangeWrapperOnDevice { } } -impl> ExchangeWrapperOnDevice { +impl> + ExchangeWrapperOnDevice +{ /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow @@ -562,9 +570,10 @@ impl> ExchangeWrapperOnDevice mut self, stream: &Stream, ) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = - unsafe { self.value.restore_async(self.null_alloc, stream) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU diff --git a/src/utils/mod.rs b/src/utils/mod.rs index bab467e42..e41a3c4ee 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,5 +1,6 @@ pub mod adapter; pub mod aliasing; +pub mod r#async; pub mod exchange; pub mod ffi; pub mod shared; From 8ec927a5e3b53239976b6584d1c1af4456c441f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 29 Dec 2023 14:47:46 +0000 Subject: [PATCH 073/120] More experiments with async API --- src/lend/impls/box.rs | 90 ++++++++++++++------- src/lend/mod.rs | 38 ++++++--- src/utils/async.rs | 184 ++++++++++++++++++++++++++++++++---------- 3 files changed, 227 insertions(+), 85 deletions(-) diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 1ec853b34..552f93e7e 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -21,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CudaAsync}, }; #[doc(hidden)] @@ -74,7 +75,7 @@ unsafe impl RustToCuda for Box { } } -unsafe impl RustToCudaAsync for Box { +unsafe impl RustToCudaAsync for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocationAsync = CombinedCudaAlloc< CudaDropWrapper>>>, @@ -121,11 +122,11 @@ unsafe impl RustToCudaAsync for Box( + unsafe fn restore_async<'stream, A: CudaAlloc>( &mut self, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + 
) -> CudaResult<(Async<'stream, (), &mut std::boxed::Box>, A)> { use rustacuda::memory::AsyncCopyDestination; struct PromiseSend(T); @@ -137,37 +138,64 @@ unsafe impl RustToCudaAsync for Box = self_ptr; + let locked_box = PromiseSend(locked_box); + let device_box = PromiseSend(device_box); + let r#async = + as crate::utils::r#async::CudaAsync< + (), + &mut Self, + >>::new((), stream, self, |data: &mut Self| { + // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(device_box); - if res == Ok(()) { - // Safety: The precondition of this method guarantees that - // &mut self has been borrowed until after this - // completion is run - unsafe { - std::ptr::copy_nonoverlapping( - locked_box.0.as_ptr().cast::(), - self_ptr.0, - 1, - ); - } + // Safety: equivalent to *data = *locked_box since + // LockedBox> doesn't drop T + unsafe { + std::ptr::copy_nonoverlapping( + locked_box.0.as_ptr().cast::(), + &mut **data, + 1, + ); } + // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(locked_box); - })) - }?; - - Ok(alloc_tail) + Ok(()) + })?; + // std::mem::drop(r#async); + + Ok((r#async, alloc_tail)) + + // { + // // TODO: express this unsafe-rich completion safely + // // by explicitly capturing &mut self until the + // // async restore has completed + // let self_ptr: *mut T = std::ptr::from_mut(self); + + // let self_ptr = PromiseSend(self_ptr); + // let locked_box = PromiseSend(locked_box); + // let device_box = PromiseSend(device_box); + + // stream.add_callback(Box::new(move |res| { + // let self_ptr: PromiseSend<_> = self_ptr; + + // std::mem::drop(device_box); + // if res == Ok(()) { + // // Safety: The precondition of this method guarantees that + // // &mut self has been borrowed until after this + // // completion is run + // unsafe { + // std::ptr::copy_nonoverlapping( + // locked_box.0.as_ptr().cast::(), + // self_ptr.0, + // 1, + // ); + // } + // } + // std::mem::drop(locked_box); + 
// })) + // }?; + + // Ok(alloc_tail) } } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 50f2aee1d..485f03c1f 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -124,11 +124,11 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Therefore, `&mut self` should remain mutably borrowed until /// synchronisation has been performed. #[allow(clippy::type_complexity)] - unsafe fn restore_async( + unsafe fn restore_async<'stream, A: CudaAlloc>( &mut self, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult; + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<(Async<'stream, (), &mut Self>, A)>; } /// # Safety @@ -179,7 +179,9 @@ pub trait LendToCuda: RustToCuda { >( &self, inner: F, - ) -> Result; + ) -> Result + where + Self: Sync; /// Moves `self` to CUDA iff `self` is [`StackOnly`]. /// @@ -197,7 +199,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: RustToCuda; + Self: Send + RustToCuda; } #[cfg(feature = "host")] @@ -211,7 +213,10 @@ impl LendToCuda for T { >( &self, inner: F, - ) -> Result { + ) -> Result + where + Self: Sync, + { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); @@ -233,7 +238,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: Send + RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -264,13 +269,16 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + &Self, >, ) -> Result, >( &self, stream: &'stream rustacuda::stream::Stream, inner: F, - ) -> Result; + ) -> Result + where + Self: Sync; /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// @@ -293,7 +301,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { inner: F, ) -> Result where - Self: RustToCuda; + Self: Send + RustToCuda; } #[cfg(feature = "host")] @@ -306,17 +314,21 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + &Self, >, ) -> Result, >( &self, stream: &'stream rustacuda::stream::Stream, inner: F, - ) -> Result { + ) -> Result + where + Self: Sync, + { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::new(const_ref, stream)?) + inner(Async::new(const_ref, stream, self, |_self| Ok(()))?) }); core::mem::drop(cuda_repr); @@ -341,12 +353,12 @@ impl LendToCudaAsync for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: Send + RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::new(owned_ref, stream)?) + inner(Async::new(owned_ref, stream, (), |()| Ok(()))?) }); core::mem::drop(alloc); diff --git a/src/utils/async.rs b/src/utils/async.rs index b691b755f..78bad1725 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -6,8 +6,8 @@ use std::{ #[cfg(feature = "host")] use rustacuda::{ - error::CudaError, error::CudaResult, event::Event, event::EventFlags, event::EventStatus, - stream::Stream, stream::StreamWaitEventFlags, + error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::Stream, + stream::StreamWaitEventFlags, }; #[cfg(feature = "host")] @@ -15,14 +15,19 @@ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { +pub trait CudaAsync<'stream, T, C: Send = ()>: Sized + IntoFuture> { /// Wraps a still-asynchronous `value` which is being computed on `stream` /// such that its computation can be synchronised on. 
/// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - fn new(value: T, stream: &'stream Stream) -> CudaResult; + fn new( + value: T, + stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> CudaResult; /// Synchronises on this computation to block until it has completed and /// the inner value can be safely returned and again be used in synchronous @@ -41,7 +46,7 @@ pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { fn move_to_stream<'stream_new>( self, stream: &'stream_new Stream, - ) -> CudaResult>; + ) -> CudaResult>; } #[cfg(feature = "host")] @@ -50,8 +55,15 @@ pub struct Sync { } #[cfg(feature = "host")] -impl<'stream, T> CudaAsync<'stream, T> for Sync { - fn new(value: T, _stream: &'stream Stream) -> CudaResult { +impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Sync { + fn new( + value: T, + _stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> CudaResult { + on_completion(capture)?; + Ok(Self { value }) } @@ -76,58 +88,136 @@ impl IntoFuture for Sync { } #[cfg(feature = "host")] -pub struct Async<'stream, T> { +pub struct Async<'stream, T, C = ()> { _stream: PhantomData<&'stream Stream>, event: CudaDropWrapper, - waker: Arc>>, value: T, + status: Arc>>, +} + +// This could also be expressed as a +// https://docs.rs/oneshot/latest/oneshot/index.html channel +#[cfg(feature = "host")] +enum AsyncStatus { + Processing { waker: Option, capture: C }, + Completed { result: CudaResult<()> }, } +// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { - fn new(value: T, stream: &'stream Stream) -> CudaResult { +impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Async<'stream, T, C> { + fn new( + value: T, + stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> 
CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); - event.record(stream)?; - let waker: Arc>> = Arc::new(Mutex::new(None)); - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut waker) = waker_callback.lock() { - if let Some(waker) = waker.take() { - waker.wake(); - } + let status = Arc::new(Mutex::new(AsyncStatus::Processing { + waker: None, + capture, + })); + + let status_callback = status.clone(); + stream.add_callback(Box::new(move |res| { + let Ok(mut status) = status_callback.lock() else { + return; + }; + + let old_status = + std::mem::replace(&mut *status, AsyncStatus::Completed { result: Ok(()) }); + + let AsyncStatus::Processing { mut waker, capture } = old_status else { + // this path should never be taken + *status = old_status; + return; + }; + + if let Err(err) = res { + *status = AsyncStatus::Completed { result: Err(err) }; + } else if let Err(err) = on_completion(capture) { + *status = AsyncStatus::Completed { result: Err(err) }; + } + + if let Some(waker) = waker.take() { + waker.wake(); } }))?; + event.record(stream)?; + Ok(Self { _stream: PhantomData::<&'stream Stream>, event, - waker, value, + status, }) } fn synchronize(self) -> CudaResult { + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + if let AsyncStatus::Completed { result } = &*status { + return result.map(|()| self.value); + } + + std::mem::drop(status); + self.event.synchronize()?; - Ok(self.value) + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + match &*status { + AsyncStatus::Completed { result } => result.map(|()| self.value), + AsyncStatus::Processing { .. 
} => Err(CudaError::NotReady), + } } #[allow(refining_impl_trait)] fn move_to_stream<'stream_new>( self, stream: &'stream_new Stream, - ) -> CudaResult> { + ) -> CudaResult> { + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + if let AsyncStatus::Completed { result } = &*status { + #[allow(clippy::let_unit_value)] + let () = (*result)?; + + std::mem::drop(status); + + // the computation has completed, so the result is available on any stream + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + event: self.event, + value: self.value, + status: self.status, + }); + } + + std::mem::drop(status); + stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; self.event.record(stream)?; - let waker_callback = self.waker.clone(); + // add a new waker callback since the waker may have received a spurious + // wake-up from when the computation completed on the original stream + let waker_callback = self.status.clone(); stream.add_callback(Box::new(move |_| { - if let Ok(mut waker) = waker_callback.lock() { - if let Some(waker) = waker.take() { - waker.wake(); + if let Ok(mut status) = waker_callback.lock() { + if let AsyncStatus::Processing { waker, .. 
} = &mut *status { + if let Some(waker) = waker.take() { + waker.wake(); + } } } }))?; @@ -135,14 +225,14 @@ impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { Ok(Async { _stream: PhantomData::<&'stream_new Stream>, event: self.event, - waker: self.waker, value: self.value, + status: self.status, }) } } #[cfg(feature = "host")] -impl<'stream, T> Async<'stream, T> { +impl<'stream, T, C> Async<'stream, T, C> { /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -157,7 +247,7 @@ impl<'stream, T> Async<'stream, T> { } #[cfg(feature = "host")] -impl<'stream, T> IntoFuture for Async<'stream, T> { +impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { type Output = CudaResult; type IntoFuture = impl Future; @@ -165,22 +255,34 @@ impl<'stream, T> IntoFuture for Async<'stream, T> { fn into_future(self) -> Self::IntoFuture { let mut wrapper = Some(self); - std::future::poll_fn(move |cx| match &wrapper { - Some(Async { waker, event, .. }) => match event.query() { - Ok(EventStatus::NotReady) => waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut waker| { - *waker = Some(cx.waker().clone()); - Poll::Pending + std::future::poll_fn(move |cx| { + let poll = match &wrapper { + #[allow(clippy::option_if_let_else)] + Some(Async { + status: status_mutex, + .. + }) => match status_mutex.lock() { + Ok(mut status_guard) => match &mut *status_guard { + AsyncStatus::Completed { result: Ok(()) } => Poll::Ready(Ok(())), + AsyncStatus::Completed { result: Err(err) } => Poll::Ready(Err(*err)), + AsyncStatus::Processing { waker, .. } => { + *waker = Some(cx.waker().clone()); + Poll::Pending + }, }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }; + + match poll { + Poll::Ready(Ok(())) => match wrapper.take() { Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), None => Poll::Ready(Err(CudaError::AlreadyAcquired)), }, - Err(err) => Poll::Ready(Err(err)), - }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + Poll::Ready(Err(err)) => Poll::Ready(Err(err)), + Poll::Pending => Poll::Pending, + } }) } } From 4993daf56d9b948e64ad5bb17c1041489485655e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 31 Dec 2023 13:43:28 +0000 Subject: [PATCH 074/120] Further API experimentation --- Cargo.toml | 5 +- src/lend/impls/box.rs | 75 ++----- src/lend/impls/option.rs | 66 +++++-- src/lend/mod.rs | 19 +- src/utils/adapter.rs | 24 ++- src/utils/aliasing/mod.rs | 8 +- src/utils/async.rs | 399 ++++++++++++++++++-------------------- src/utils/exchange/mod.rs | 6 +- 8 files changed, 308 insertions(+), 294 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bbabb2007..90626aae6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] -host = ["dep:rustacuda", "dep:regex"] +host = ["dep:rustacuda", "dep:regex", "dep:oneshot"] kernel = ["dep:rust-cuda-kernel"] [dependencies] @@ -34,5 +34,8 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } +safer_owning_ref = { version = "0.5" } +oneshot = { version = "0.1", optional = true, features = ["std", "async"] } + rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 552f93e7e..34224eb62 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -21,7 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, - utils::r#async::{Async, CudaAsync}, + utils::r#async::Async, }; #[doc(hidden)] @@ -75,7 +75,7 @@ unsafe impl RustToCuda for Box { } } -unsafe impl 
RustToCudaAsync for Box { +unsafe impl RustToCudaAsync for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocationAsync = CombinedCudaAlloc< CudaDropWrapper>>>, @@ -122,80 +122,41 @@ unsafe impl RustToCudaAsync fo } #[cfg(feature = "host")] - unsafe fn restore_async<'stream, A: CudaAlloc>( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, - ) -> CudaResult<(Async<'stream, (), &mut std::boxed::Box>, A)> { + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )> { use rustacuda::memory::AsyncCopyDestination; - struct PromiseSend(T); - #[allow(clippy::non_send_fields_in_send_ty)] - unsafe impl Send for PromiseSend {} - let (alloc_front, alloc_tail) = alloc.split(); let (mut locked_box, device_box) = alloc_front.split(); device_box.async_copy_to(&mut *locked_box, stream)?; - let locked_box = PromiseSend(locked_box); - let device_box = PromiseSend(device_box); + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + CombinedCudaAlloc::new(locked_box, device_box), + move |this, alloc| { + let data: &mut T = &mut *this; + let (locked_box, device_box) = alloc.split(); - let r#async = - as crate::utils::r#async::CudaAsync< - (), - &mut Self, - >>::new((), stream, self, |data: &mut Self| { - // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(device_box); // Safety: equivalent to *data = *locked_box since // LockedBox> doesn't drop T unsafe { - std::ptr::copy_nonoverlapping( - locked_box.0.as_ptr().cast::(), - &mut **data, - 1, - ); + std::ptr::copy_nonoverlapping(locked_box.as_ptr().cast::(), data, 1); } - // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(locked_box); Ok(()) - })?; - // std::mem::drop(r#async); + }, + )?; Ok((r#async, alloc_tail)) - - // { - // // TODO: express 
this unsafe-rich completion safely - // // by explicitly capturing &mut self until the - // // async restore has completed - // let self_ptr: *mut T = std::ptr::from_mut(self); - - // let self_ptr = PromiseSend(self_ptr); - // let locked_box = PromiseSend(locked_box); - // let device_box = PromiseSend(device_box); - - // stream.add_callback(Box::new(move |res| { - // let self_ptr: PromiseSend<_> = self_ptr; - - // std::mem::drop(device_box); - // if res == Ok(()) { - // // Safety: The precondition of this method guarantees that - // // &mut self has been borrowed until after this - // // completion is run - // unsafe { - // std::ptr::copy_nonoverlapping( - // locked_box.0.as_ptr().cast::(), - // self_ptr.0, - // 1, - // ); - // } - // } - // std::mem::drop(locked_box); - // })) - // }?; - - // Ok(alloc_tail) } } diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index a0f0f8b4a..5a70a24c6 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -12,7 +12,10 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + utils::r#async::Async, +}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -118,18 +121,59 @@ unsafe impl RustToCudaAsync for Option { } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )> { let (alloc_front, alloc_tail) = alloc.split(); - match (self, alloc_front) { - (Some(value), Some(alloc_front)) => { - value.restore_async(CombinedCudaAlloc::new(alloc_front, alloc_tail), stream) - }, - _ => Ok(alloc_tail), + if let (Some(_), Some(alloc_front)) = (&mut *this, 
alloc_front) { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + #[allow(clippy::option_if_let_else)] + let (r#async, alloc_tail) = RustToCudaAsync::restore_async( + this.map_mut(|value| match value { + Some(value) => value, + None => unreachable!(), // TODO + }), + CombinedCudaAlloc::new(alloc_front, alloc_tail), + stream, + )?; + + let (value, capture_on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(value); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some((capture, on_completion)) = capture_on_completion { + let r#async = Async::pending(this, stream, Some(capture), |this, capture| { + let mut value_backup = unsafe { + std::mem::ManuallyDrop::new(std::ptr::read(this).map_mut( + |value| match value { + Some(value) => value, + None => unreachable!(), // TODO + }, + )) + }; + + if let (Some(_), Some(capture)) = (&mut **this, capture) { + on_completion(&mut value_backup, capture)?; + } + + Ok(()) + })?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) } } } @@ -165,7 +209,7 @@ impl RustToCudaProxy } } -impl RustToCudaAsyncProxy> +impl RustToCudaAsyncProxy> for Option> { fn from_ref(val: &Option) -> &Self { diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 485f03c1f..0b442cab5 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,7 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, - utils::r#async::{Async, CudaAsync}, + utils::r#async::Async, }; mod impls; @@ -124,11 +124,14 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Therefore, `&mut self` should remain mutably borrowed until /// synchronisation has been performed. 
#[allow(clippy::type_complexity)] - unsafe fn restore_async<'stream, A: CudaAlloc>( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<(Async<'stream, (), &mut Self>, A)>; + ) -> rustacuda::error::CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )>; } /// # Safety @@ -293,6 +296,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, + (), >, ) -> Result, >( @@ -328,7 +332,9 @@ impl LendToCudaAsync for T { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::new(const_ref, stream, self, |_self| Ok(()))?) + inner(Async::pending(const_ref, stream, self, |_ref, _self| { + Ok(()) + })?) }); core::mem::drop(cuda_repr); @@ -345,6 +351,7 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, + (), >, ) -> Result, >( @@ -358,7 +365,7 @@ impl LendToCudaAsync for T { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::new(owned_ref, stream, (), |()| Ok(()))?) + inner(Async::pending(owned_ref, stream, (), |_ref, ()| Ok(()))?) 
}); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 8e27d98df..182c29184 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -146,14 +146,28 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + Self::CudaAllocationAsync, + >, + A, + )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - Ok(alloc_tail) + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + NoCudaAlloc, + |_this, NoCudaAlloc| Ok(()), + )?; + + Ok((r#async, alloc_tail)) } } diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index e7753cf92..aa0a42742 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,5 +1,5 @@ -mod r#const; -mod dynamic; +// mod r#const; +// mod dynamic; -pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; -pub use r#const::SplitSliceOverCudaThreadsConstStride; +// pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; +// pub use r#const::SplitSliceOverCudaThreadsConstStride; diff --git a/src/utils/async.rs b/src/utils/async.rs index 78bad1725..683eeb235 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -1,8 +1,5 @@ #[cfg(feature = "host")] -use std::{ - future::Future, future::IntoFuture, future::Ready, marker::PhantomData, sync::Arc, sync::Mutex, - task::Poll, task::Waker, -}; +use std::{future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; #[cfg(feature = "host")] use rustacuda::{ @@ -14,225 +11,159 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] 
-#[allow(clippy::module_name_repetitions)] -pub trait CudaAsync<'stream, T, C: Send = ()>: Sized + IntoFuture> { - /// Wraps a still-asynchronous `value` which is being computed on `stream` - /// such that its computation can be synchronised on. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn new( - value: T, - stream: &'stream Stream, - capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, - ) -> CudaResult; - - /// Synchronises on this computation to block until it has completed and - /// the inner value can be safely returned and again be used in synchronous - /// operations. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn synchronize(self) -> CudaResult; - - /// Moves the asynchronous data move to a different [`Stream`]. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn move_to_stream<'stream_new>( - self, - stream: &'stream_new Stream, - ) -> CudaResult>; -} - -#[cfg(feature = "host")] -pub struct Sync { +pub struct Async<'stream, T, C> { + _stream: PhantomData<&'stream Stream>, value: T, + status: AsyncStatus, } #[cfg(feature = "host")] -impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Sync { - fn new( - value: T, - _stream: &'stream Stream, +enum AsyncStatus { + #[allow(clippy::type_complexity)] + Processing { + receiver: oneshot::Receiver>, capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, - ) -> CudaResult { - on_completion(capture)?; - - Ok(Self { value }) - } - - fn synchronize(self) -> CudaResult { - Ok(self.value) - } - - #[allow(refining_impl_trait)] - fn move_to_stream(self, _stream: &Stream) -> CudaResult { - Ok(self) - } + on_completion: Box CudaResult<()>>, + event: CudaDropWrapper, + }, + Completed { + result: CudaResult<()>, + }, } +// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] 
-impl IntoFuture for Sync { - type IntoFuture = Ready>; - type Output = CudaResult; +impl<'stream, T, C> Async<'stream, T, C> { + /// Wraps a `value` which is ready on `stream`. + #[must_use] + pub const fn ready(value: T, stream: &'stream Stream) -> Self { + let _ = stream; - fn into_future(self) -> Self::IntoFuture { - std::future::ready(Ok(self.value)) + Self { + _stream: PhantomData::<&'stream Stream>, + value, + status: AsyncStatus::Completed { result: Ok(()) }, + } } -} -#[cfg(feature = "host")] -pub struct Async<'stream, T, C = ()> { - _stream: PhantomData<&'stream Stream>, - event: CudaDropWrapper, - value: T, - status: Arc>>, -} - -// This could also be expressed as a -// https://docs.rs/oneshot/latest/oneshot/index.html channel -#[cfg(feature = "host")] -enum AsyncStatus { - Processing { waker: Option, capture: C }, - Completed { result: CudaResult<()> }, -} - -// TODO: completion is NOT allowed to make any cuda calls -#[cfg(feature = "host")] -impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Async<'stream, T, C> { - fn new( + /// Wraps a still-pending `value` which is being computed on `stream` + /// such that its computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
+ pub fn pending( value: T, stream: &'stream Stream, capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + on_completion: impl FnOnce(&mut T, C) -> CudaResult<()> + 'static, ) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); - let status = Arc::new(Mutex::new(AsyncStatus::Processing { - waker: None, - capture, - })); - - let status_callback = status.clone(); - stream.add_callback(Box::new(move |res| { - let Ok(mut status) = status_callback.lock() else { - return; - }; - - let old_status = - std::mem::replace(&mut *status, AsyncStatus::Completed { result: Ok(()) }); - - let AsyncStatus::Processing { mut waker, capture } = old_status else { - // this path should never be taken - *status = old_status; - return; - }; - - if let Err(err) = res { - *status = AsyncStatus::Completed { result: Err(err) }; - } else if let Err(err) = on_completion(capture) { - *status = AsyncStatus::Completed { result: Err(err) }; - } - - if let Some(waker) = waker.take() { - waker.wake(); - } - }))?; + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; event.record(stream)?; Ok(Self { _stream: PhantomData::<&'stream Stream>, - event, value, - status, + status: AsyncStatus::Processing { + capture, + receiver, + on_completion: Box::new(on_completion), + event, + }, }) } - fn synchronize(self) -> CudaResult { - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
+ pub fn synchronize(mut self) -> CudaResult { + let (receiver, capture, on_completion) = match self.status { + AsyncStatus::Completed { result } => return result.map(|()| self.value), + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event: _, + } => (receiver, capture, on_completion), }; - if let AsyncStatus::Completed { result } = &*status { - return result.map(|()| self.value); + match receiver.recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - std::mem::drop(status); - - self.event.synchronize()?; - - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); - }; + on_completion(&mut self.value, capture)?; - match &*status { - AsyncStatus::Completed { result } => result.map(|()| self.value), - AsyncStatus::Processing { .. } => Err(CudaError::NotReady), - } + Ok(self.value) } - #[allow(refining_impl_trait)] - fn move_to_stream<'stream_new>( - self, + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn move_to_stream<'stream_new>( + mut self, stream: &'stream_new Stream, ) -> CudaResult> { - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); + let (receiver, capture, on_completion, event) = match self.status { + AsyncStatus::Completed { .. 
} => { + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + value: self.value, + status: self.status, + }) + }, + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + } => (receiver, capture, on_completion, event), }; - if let AsyncStatus::Completed { result } = &*status { - #[allow(clippy::let_unit_value)] - let () = (*result)?; - - std::mem::drop(status); - - // the computation has completed, so the result is available on any stream - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - event: self.event, - value: self.value, - status: self.status, - }); - } - - std::mem::drop(status); - - stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; - self.event.record(stream)?; + match receiver.try_recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => { + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + value: self.value, + status: AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + }, + }); + }, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), + }; - // add a new waker callback since the waker may have received a spurious - // wake-up from when the computation completed on the original stream - let waker_callback = self.status.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut status) = waker_callback.lock() { - if let AsyncStatus::Processing { waker, .. 
} = &mut *status { - if let Some(waker) = waker.take() { - waker.wake(); - } - } - } - }))?; + on_completion(&mut self.value, capture)?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - event: self.event, value: self.value, - status: self.status, + status: AsyncStatus::Completed { result: Ok(()) }, }) } -} -#[cfg(feature = "host")] -impl<'stream, T, C> Async<'stream, T, C> { + #[allow(clippy::missing_errors_doc)] // FIXME + #[allow(clippy::type_complexity)] // FIXME /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -241,8 +172,69 @@ impl<'stream, T, C> Async<'stream, T, C> { /// This method must only be used to construct a larger asynchronous /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. - pub unsafe fn unwrap_unchecked(self) -> T { - self.value + pub unsafe fn unwrap_unchecked( + self, + ) -> CudaResult<(T, Option<(C, Box CudaResult<()>>)>)> { + match self.status { + AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), + AsyncStatus::Completed { result: Err(err) } => Err(err), + AsyncStatus::Processing { + receiver: _, + capture, + on_completion, + event: _, + } => Ok((self.value, Some((capture, on_completion)))), + } + } +} + +#[cfg(feature = "host")] +struct AsyncFuture<'stream, T, C> { + _stream: PhantomData<&'stream Stream>, + value: Option, + #[allow(clippy::type_complexity)] + capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, + status: AsyncStatus, +} + +#[cfg(feature = "host")] +impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { + type Output = CudaResult; + + fn poll( + self: core::pin::Pin<&mut Self>, + cx: &mut core::task::Context<'_>, + ) -> Poll { + // Safety: this function does not move out of `this` + let this = unsafe { self.get_unchecked_mut() }; + + match &mut this.status { + AsyncStatus::Processing { + receiver, + capture: (), + on_completion: _, + event: _, + } => match std::pin::Pin::new(receiver).poll(cx) 
{ + Poll::Ready(Ok(Ok(()))) => (), + Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), + Poll::Ready(Err(oneshot::RecvError)) => { + return Poll::Ready(Err(CudaError::AlreadyAcquired)) + }, + Poll::Pending => return Poll::Pending, + }, + AsyncStatus::Completed { result: Ok(()) } => (), + AsyncStatus::Completed { result: Err(err) } => return Poll::Ready(Err(*err)), + } + + let Some(mut value) = this.value.take() else { + return Poll::Ready(Err(CudaError::AlreadyAcquired)); + }; + + if let Some((capture, on_completion)) = this.capture_on_completion.take() { + on_completion(&mut value, capture)?; + } + + Poll::Ready(Ok(value)) } } @@ -253,36 +245,29 @@ impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - std::future::poll_fn(move |cx| { - let poll = match &wrapper { - #[allow(clippy::option_if_let_else)] - Some(Async { - status: status_mutex, - .. - }) => match status_mutex.lock() { - Ok(mut status_guard) => match &mut *status_guard { - AsyncStatus::Completed { result: Ok(()) } => Poll::Ready(Ok(())), - AsyncStatus::Completed { result: Err(err) } => Poll::Ready(Err(*err)), - AsyncStatus::Processing { waker, .. } => { - *waker = Some(cx.waker().clone()); - Poll::Pending - }, - }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + let (capture_on_completion, status) = match self.status { + AsyncStatus::Completed { result } => (None, AsyncStatus::Completed { result }), + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + } => ( + Some((capture, on_completion)), + AsyncStatus::Processing { + receiver, + capture: (), + on_completion: Box::new(|_self, ()| Ok(())), + event, }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }; + ), + }; - match poll { - Poll::Ready(Ok(())) => match wrapper.take() { - Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Poll::Ready(Err(err)) => Poll::Ready(Err(err)), - Poll::Pending => Poll::Pending, - } - }) + AsyncFuture { + _stream: PhantomData::<&'stream Stream>, + value: Some(self.value), + capture_on_completion, + status, + } } } diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index 722e02559..9c0de5e36 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,4 +1,4 @@ -pub mod buffer; +// pub mod buffer; -#[cfg(feature = "host")] -pub mod wrapper; +// #[cfg(feature = "host")] +// pub mod wrapper; From f8618c5abfe6323356592cd5356108760016b4d9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 1 Jan 2024 13:43:18 +0000 Subject: [PATCH 075/120] Further async API experimentation --- src/host/mod.rs | 5 ++ src/lend/impls/box.rs | 23 +++++--- src/lend/impls/boxed_slice.rs | 106 +++++++++++++++++++++++++++++++++- src/lend/impls/option.rs | 86 ++++++++++++++++----------- src/lend/mod.rs | 44 ++++++++++---- src/utils/adapter.rs | 23 ++++---- src/utils/async.rs | 37 ++++++------ 7 files changed, 243 insertions(+), 81 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index f77c75792..6dee3b1f6 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -336,6 +336,11 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { self.host_val } + #[must_use] + pub(crate) fn for_async_completion(&mut self) -> &mut T { + self.host_val + } + #[must_use] pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 34224eb62..1dffba723 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -83,14 +83,16 @@ unsafe impl RustToCudaAsync for Box; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = 
Self::CudaAllocationAsync; #[cfg(feature = "host")] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -114,9 +116,14 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -141,7 +148,7 @@ unsafe impl RustToCudaAsync for Box RustToCuda for Box<[T]> { } } +unsafe impl RustToCudaAsync for Box<[T]> { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = Self::CudaAllocationAsync; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'stream, DeviceAccessible, &Self>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, 
stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(BoxedSliceCudaRepresentation { + data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::, + }), + stream, + self, + |_cuda_repr, _self| Ok(()), + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + A, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + let (mut locked_buffer, device_buffer) = alloc_front.split(); + + device_buffer.async_copy_to(&mut *locked_buffer, stream)?; + + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + CombinedCudaAlloc::new(locked_buffer, device_buffer), + move |this: &mut Self, alloc| { + let data: &mut [T] = &mut *this; + let (locked_buffer, device_buffer) = alloc.split(); + + std::mem::drop(device_buffer); + // Safety: equivalent to data.copy_from_slice(&*locked_buffer) + // since LockedBox> doesn't drop T + unsafe { + std::ptr::copy_nonoverlapping( + locked_buffer.as_ptr().cast::(), + data.as_mut_ptr(), + data.len(), + ); + } + std::mem::drop(locked_buffer); + Ok(()) + }, + )?; + + Ok((r#async, alloc_tail)) + } +} + unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 5a70a24c6..b447c6e34 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -83,41 +83,65 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { type CudaAllocationAsync = Option<::CudaAllocationAsync>; + #[cfg(feature = "host")] + type RestoreAsyncCapture = ( + ::RestoreAsyncCapture, + Box< + dyn 
FnOnce(&mut T, ::RestoreAsyncCapture) -> CudaResult<()> + + 'static, + >, + ); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { None => ( - OptionCudaRepresentation { - maybe: MaybeUninit::uninit(), - present: false, - }, + Async::ready( + DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }), + stream, + ), CombinedCudaAlloc::new(None, alloc), ), Some(value) => { let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; - let (alloc_front, alloc_tail) = alloc.split(); + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; - ( - OptionCudaRepresentation { - maybe: MaybeUninit::new(cuda_repr), - present: true, - }, - CombinedCudaAlloc::new(Some(alloc_front), alloc_tail), - ) + let (alloc_front, alloc_tail) = alloc.split(); + let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); + + let option_cuda_repr = DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }); + + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(option_cuda_repr, stream, self, |option_cuda_repr, this| { + // if let Some(capture) = this { + // on_completion(todo!(), capture)?; + // } + Ok(()) + })? 
+ } else { + Async::ready(option_cuda_repr, stream) + }; + + (r#async, alloc) }, }; - Ok((DeviceAccessible::from(cuda_repr), alloc)) + Ok((cuda_repr, alloc)) } #[cfg(feature = "host")] @@ -126,7 +150,7 @@ unsafe impl RustToCudaAsync for Option { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )> { let (alloc_front, alloc_tail) = alloc.split(); @@ -150,22 +174,18 @@ unsafe impl RustToCudaAsync for Option { let this = std::mem::ManuallyDrop::into_inner(this_backup); if let Some((capture, on_completion)) = capture_on_completion { - let r#async = Async::pending(this, stream, Some(capture), |this, capture| { - let mut value_backup = unsafe { - std::mem::ManuallyDrop::new(std::ptr::read(this).map_mut( - |value| match value { - Some(value) => value, - None => unreachable!(), // TODO - }, - )) - }; - - if let (Some(_), Some(capture)) = (&mut **this, capture) { - on_completion(&mut value_backup, capture)?; - } - - Ok(()) - })?; + let r#async = Async::pending( + this, + stream, + (capture, on_completion), + |this: &mut Self, (capture, on_completion)| { + if let Some(value) = this { + on_completion(value, capture)?; + } + + Ok(()) + }, + )?; Ok((r#async, alloc_tail)) } else { let r#async = Async::ready(this, stream); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 0b442cab5..df533a050 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -75,6 +75,10 @@ pub unsafe trait RustToCuda { pub unsafe trait RustToCudaAsync: RustToCuda { type CudaAllocationAsync: CudaAlloc; + #[doc(hidden)] + #[cfg(feature = "host")] + type RestoreAsyncCapture; + #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -98,12 +102,12 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Similarly, `&self` should remain borrowed until synchronisation has /// been performed. 
#[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )>; @@ -129,7 +133,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )>; } @@ -296,7 +300,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, - (), + Self, >, ) -> Result, >( @@ -331,10 +335,20 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::pending(const_ref, stream, self, |_ref, _self| { - Ok(()) - })?) + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(const_ref, stream, self, |const_ref, this| { + // TODO + // on_completion(const_ref.for_host(), this) + Ok(()) + })? + } else { + Async::ready(const_ref, stream) + }; + + inner(r#async) }); core::mem::drop(cuda_repr); @@ -351,7 +365,7 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, - (), + Self, >, ) -> Result, >( @@ -364,8 +378,18 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::pending(owned_ref, stream, (), |_ref, ()| Ok(()))?) + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(owned_ref, stream, self, |owned_ref, this| { + on_completion(owned_ref.for_async_completion(), &this) + })? + } else { + Async::ready(owned_ref, stream) + }; + + inner(r#async) }); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 182c29184..e2f667be8 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -130,19 +130,24 @@ unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = (); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - _stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(*self), alloc)) + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(*self), stream), + alloc, + )) } #[cfg(feature = "host")] @@ -154,18 +159,14 @@ unsafe impl RustToCudaAsync crate::utils::r#async::Async< 'stream, owning_ref::BoxRefMut<'a, O, Self>, - Self::CudaAllocationAsync, + Self::RestoreAsyncCapture, + Self, >, A, )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - let r#async = crate::utils::r#async::Async::pending( - this, - stream, - NoCudaAlloc, - |_this, NoCudaAlloc| Ok(()), - )?; + let r#async = crate::utils::r#async::Async::pending(this, stream, (), |_this, ()| Ok(()))?; Ok((r#async, alloc_tail)) } diff --git a/src/utils/async.rs b/src/utils/async.rs index 683eeb235..561cf97f8 100644 --- 
a/src/utils/async.rs +++ b/src/utils/async.rs @@ -1,5 +1,5 @@ #[cfg(feature = "host")] -use std::{future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; +use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; #[cfg(feature = "host")] use rustacuda::{ @@ -11,19 +11,19 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] -pub struct Async<'stream, T, C> { +pub struct Async<'stream, T: BorrowMut, C, B: ?Sized = T> { _stream: PhantomData<&'stream Stream>, value: T, - status: AsyncStatus, + status: AsyncStatus, } #[cfg(feature = "host")] -enum AsyncStatus { +enum AsyncStatus { #[allow(clippy::type_complexity)] Processing { receiver: oneshot::Receiver>, capture: C, - on_completion: Box CudaResult<()>>, + on_completion: Box CudaResult<()>>, event: CudaDropWrapper, }, Completed { @@ -33,7 +33,7 @@ enum AsyncStatus { // TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T, C> Async<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// Wraps a `value` which is ready on `stream`. 
#[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { @@ -56,7 +56,7 @@ impl<'stream, T, C> Async<'stream, T, C> { value: T, stream: &'stream Stream, capture: C, - on_completion: impl FnOnce(&mut T, C) -> CudaResult<()> + 'static, + on_completion: impl FnOnce(&mut B, C) -> CudaResult<()> + 'static, ) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, @@ -103,7 +103,7 @@ impl<'stream, T, C> Async<'stream, T, C> { Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - on_completion(&mut self.value, capture)?; + on_completion(self.value.borrow_mut(), capture)?; Ok(self.value) } @@ -116,7 +116,7 @@ impl<'stream, T, C> Async<'stream, T, C> { pub fn move_to_stream<'stream_new>( mut self, stream: &'stream_new Stream, - ) -> CudaResult> { + ) -> CudaResult> { let (receiver, capture, on_completion, event) = match self.status { AsyncStatus::Completed { .. } => { return Ok(Async { @@ -153,7 +153,7 @@ impl<'stream, T, C> Async<'stream, T, C> { Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - on_completion(&mut self.value, capture)?; + on_completion(self.value.borrow_mut(), capture)?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, @@ -174,7 +174,10 @@ impl<'stream, T, C> Async<'stream, T, C> { /// same [`Stream`]. 
pub unsafe fn unwrap_unchecked( self, - ) -> CudaResult<(T, Option<(C, Box CudaResult<()>>)>)> { + ) -> CudaResult<( + T, + Option<(C, Box CudaResult<()> + 'static>)>, + )> { match self.status { AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), @@ -189,16 +192,16 @@ impl<'stream, T, C> Async<'stream, T, C> { } #[cfg(feature = "host")] -struct AsyncFuture<'stream, T, C> { +struct AsyncFuture<'stream, T: BorrowMut, C, B: ?Sized> { _stream: PhantomData<&'stream Stream>, value: Option, #[allow(clippy::type_complexity)] - capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, - status: AsyncStatus, + capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, + status: AsyncStatus<(), B>, } #[cfg(feature = "host")] -impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, C, B> { type Output = CudaResult; fn poll( @@ -231,7 +234,7 @@ impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { }; if let Some((capture, on_completion)) = this.capture_on_completion.take() { - on_completion(&mut value, capture)?; + on_completion(value.borrow_mut(), capture)?; } Poll::Ready(Ok(value)) @@ -239,7 +242,7 @@ impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { } #[cfg(feature = "host")] -impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, B> { type Output = CudaResult; type IntoFuture = impl Future; From 5f52d1d3a3f0c125a7a4b08079d1d4174ffd994d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 2 Jan 2024 05:10:04 +0000 Subject: [PATCH 076/120] Further async API design work --- src/host/mod.rs | 5 -- src/lend/impls/box.rs | 20 ++--- src/lend/impls/boxed_slice.rs | 20 ++--- src/lend/impls/option.rs | 36 +++----- src/lend/mod.rs | 42 ++++------ src/utils/adapter.rs | 13 +-- src/utils/async.rs | 152 
+++++++++++++++++++++------------- 7 files changed, 144 insertions(+), 144 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 6dee3b1f6..f77c75792 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -336,11 +336,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { self.host_val } - #[must_use] - pub(crate) fn for_async_completion(&mut self) -> &mut T { - self.host_val - } - #[must_use] pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 1dffba723..121fe3905 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -22,6 +22,8 @@ use crate::{ host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, utils::r#async::Async, + utils::r#async::CompletionFnMut, + utils::r#async::NoCompletion, }; #[doc(hidden)] @@ -83,8 +85,6 @@ unsafe impl RustToCudaAsync for Box; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = Self::CudaAllocationAsync; #[cfg(feature = "host")] unsafe fn borrow_async<'stream, A: CudaAlloc>( @@ -92,7 +92,7 @@ unsafe impl RustToCudaAsync for Box rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -121,8 +121,7 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -144,14 +143,11 @@ unsafe impl RustToCudaAsync for Box>::pending( this, stream, - CombinedCudaAlloc::new(locked_box, device_box), - move |this: &mut Self, alloc| { + Box::new(move 
|this: &mut Self| { let data: &mut T = &mut *this; - let (locked_box, device_box) = alloc.split(); - std::mem::drop(device_box); // Safety: equivalent to *data = *locked_box since // LockedBox> doesn't drop T @@ -160,7 +156,7 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box<[ >; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = Self::CudaAllocationAsync; #[cfg(feature = "host")] unsafe fn borrow_async<'stream, A: CudaAlloc>( @@ -100,7 +98,7 @@ unsafe impl RustToCudaAsync for Box<[ alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -132,8 +130,7 @@ unsafe impl RustToCudaAsync for Box<[ _marker: PhantomData::, }), stream, - self, - |_cuda_repr, _self| Ok(()), + NoCompletion, )?, CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), )) @@ -145,7 +142,7 @@ unsafe impl RustToCudaAsync for Box<[ alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -155,14 +152,11 @@ unsafe impl RustToCudaAsync for Box<[ device_buffer.async_copy_to(&mut *locked_buffer, stream)?; - let r#async = crate::utils::r#async::Async::pending( + let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending( this, stream, - CombinedCudaAlloc::new(locked_buffer, device_buffer), - move |this: &mut Self, alloc| { + Box::new(move |this: &mut Self| { let data: &mut [T] = &mut *this; - let (locked_buffer, device_buffer) = alloc.split(); - std::mem::drop(device_buffer); // Safety: equivalent 
to data.copy_from_slice(&*locked_buffer) // since LockedBox> doesn't drop T @@ -175,7 +169,7 @@ unsafe impl RustToCudaAsync for Box<[ } std::mem::drop(locked_buffer); Ok(()) - }, + }), )?; Ok((r#async, alloc_tail)) diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index b447c6e34..c05c0d3bb 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -14,7 +14,7 @@ use crate::{ #[cfg(feature = "host")] use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, - utils::r#async::Async, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -83,14 +83,6 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { type CudaAllocationAsync = Option<::CudaAllocationAsync>; - #[cfg(feature = "host")] - type RestoreAsyncCapture = ( - ::RestoreAsyncCapture, - Box< - dyn FnOnce(&mut T, ::RestoreAsyncCapture) -> CudaResult<()> - + 'static, - >, - ); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -99,7 +91,7 @@ unsafe impl RustToCudaAsync for Option { alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { @@ -126,13 +118,8 @@ unsafe impl RustToCudaAsync for Option { present: true, }); - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(option_cuda_repr, stream, self, |option_cuda_repr, this| { - // if let Some(capture) = this { - // on_completion(todo!(), capture)?; - // } - Ok(()) - })? + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(option_cuda_repr, stream, NoCompletion)? 
} else { Async::ready(option_cuda_repr, stream) }; @@ -150,7 +137,7 @@ unsafe impl RustToCudaAsync for Option { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { let (alloc_front, alloc_tail) = alloc.split(); @@ -168,23 +155,22 @@ unsafe impl RustToCudaAsync for Option { stream, )?; - let (value, capture_on_completion) = unsafe { r#async.unwrap_unchecked()? }; + let (value, on_completion) = unsafe { r#async.unwrap_unchecked()? }; std::mem::forget(value); let this = std::mem::ManuallyDrop::into_inner(this_backup); - if let Some((capture, on_completion)) = capture_on_completion { - let r#async = Async::pending( + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( this, stream, - (capture, on_completion), - |this: &mut Self, (capture, on_completion)| { + Box::new(|this: &mut Self| { if let Some(value) = this { - on_completion(value, capture)?; + on_completion(value)?; } Ok(()) - }, + }), )?; Ok((r#async, alloc_tail)) } else { diff --git a/src/lend/mod.rs b/src/lend/mod.rs index df533a050..598a586b8 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,7 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, - utils::r#async::Async, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; mod impls; @@ -75,10 +75,6 @@ pub unsafe trait RustToCuda { pub unsafe trait RustToCudaAsync: RustToCuda { type CudaAllocationAsync: CudaAlloc; - #[doc(hidden)] - #[cfg(feature = "host")] - type RestoreAsyncCapture; - #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -107,7 +103,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: A, stream: &'stream 
rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )>; @@ -133,7 +129,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )>; } @@ -274,9 +270,9 @@ pub trait LendToCudaAsync: RustToCudaAsync { E: From, F: FnOnce( Async< + '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, - &Self, >, ) -> Result, >( @@ -296,11 +292,11 @@ pub trait LendToCudaAsync: RustToCudaAsync { 'stream, O, E: From, - F: FnOnce( + F: for<'a> FnOnce( Async< + 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, - Self, >, ) -> Result, >( @@ -320,9 +316,9 @@ impl LendToCudaAsync for T { E: From, F: FnOnce( Async< + '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, - &Self, >, ) -> Result, >( @@ -338,12 +334,8 @@ impl LendToCudaAsync for T { let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(const_ref, stream, self, |const_ref, this| { - // TODO - // on_completion(const_ref.for_host(), this) - Ok(()) - })? + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(const_ref, stream, NoCompletion)? } else { Async::ready(const_ref, stream) }; @@ -361,11 +353,11 @@ impl LendToCudaAsync for T { 'stream, O, E: From, - F: FnOnce( + F: for<'a> FnOnce( Async< + 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, - Self, >, ) -> Result, >( @@ -381,15 +373,11 @@ impl LendToCudaAsync for T { let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(owned_ref, stream, self, |owned_ref, this| { - on_completion(owned_ref.for_async_completion(), &this) - })? + if matches!(capture_on_completion, Some(NoCompletion)) { + inner(Async::pending(owned_ref, stream, NoCompletion)?) } else { - Async::ready(owned_ref, stream) - }; - - inner(r#async) + inner(Async::ready(owned_ref, stream)) + } }); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index e2f667be8..093a02fd4 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -130,8 +130,6 @@ unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { type CudaAllocationAsync = NoCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = (); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -140,7 +138,7 @@ unsafe impl RustToCudaAsync alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - crate::utils::r#async::Async<'stream, DeviceAccessible, &Self>, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); @@ -157,16 +155,19 @@ unsafe impl RustToCudaAsync stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< + 'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, - Self::RestoreAsyncCapture, - Self, + crate::utils::r#async::CompletionFnMut<'a, Self>, >, A, )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - let r#async = crate::utils::r#async::Async::pending(this, stream, (), |_this, ()| Ok(()))?; + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; Ok((r#async, alloc_tail)) } diff --git a/src/utils/async.rs b/src/utils/async.rs index 
561cf97f8..6aab8adca 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -11,29 +11,68 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] -pub struct Async<'stream, T: BorrowMut, C, B: ?Sized = T> { +pub struct NoCompletion; +#[cfg(feature = "host")] +pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; + +#[cfg(feature = "host")] +pub trait Completion>: sealed::Sealed { + type Completed: ?Sized; + + #[allow(clippy::missing_errors_doc)] // FIXME + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; +} +#[cfg(feature = "host")] +mod sealed { + pub trait Sealed {} +} + +#[cfg(feature = "host")] +impl Completion for NoCompletion { + type Completed = T; + + fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { + Ok(()) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for NoCompletion {} + +#[cfg(feature = "host")] +impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { + type Completed = B; + + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + (self)(completed) + } +} +#[cfg(feature = "host")] +impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} + +#[cfg(feature = "host")] +pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { _stream: PhantomData<&'stream Stream>, value: T, - status: AsyncStatus, + status: AsyncStatus<'a, T, C>, + _capture: PhantomData<&'a ()>, } #[cfg(feature = "host")] -enum AsyncStatus { +enum AsyncStatus<'a, T: BorrowMut, C: Completion> { #[allow(clippy::type_complexity)] Processing { receiver: oneshot::Receiver>, - capture: C, - on_completion: Box CudaResult<()>>, + completion: C, event: CudaDropWrapper, + _capture: PhantomData<&'a T>, }, Completed { result: CudaResult<()>, }, } -// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 
'stream, T, C> { /// Wraps a `value` which is ready on `stream`. #[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { @@ -43,6 +82,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { _stream: PhantomData::<&'stream Stream>, value, status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, } } @@ -52,12 +92,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn pending( - value: T, - stream: &'stream Stream, - capture: C, - on_completion: impl FnOnce(&mut B, C) -> CudaResult<()> + 'static, - ) -> CudaResult { + pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); @@ -71,11 +106,12 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { _stream: PhantomData::<&'stream Stream>, value, status: AsyncStatus::Processing { - capture, receiver, - on_completion: Box::new(on_completion), + completion, event, + _capture: PhantomData::<&'a T>, }, + _capture: PhantomData::<&'a ()>, }) } @@ -87,14 +123,14 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. 
pub fn synchronize(mut self) -> CudaResult { - let (receiver, capture, on_completion) = match self.status { + let (receiver, completion) = match self.status { AsyncStatus::Completed { result } => return result.map(|()| self.value), AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event: _, - } => (receiver, capture, on_completion), + _capture, + } => (receiver, completion), }; match receiver.recv() { @@ -103,7 +139,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - on_completion(self.value.borrow_mut(), capture)?; + completion.complete(self.value.borrow_mut())?; Ok(self.value) } @@ -116,21 +152,22 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { pub fn move_to_stream<'stream_new>( mut self, stream: &'stream_new Stream, - ) -> CudaResult> { - let (receiver, capture, on_completion, event) = match self.status { + ) -> CudaResult> { + let (receiver, completion, event) = match self.status { AsyncStatus::Completed { .. 
} => { return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, value: self.value, status: self.status, + _capture: PhantomData::<&'a ()>, }) }, AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, - } => (receiver, capture, on_completion, event), + _capture, + } => (receiver, completion, event), }; match receiver.try_recv() { @@ -144,21 +181,23 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { value: self.value, status: AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, + _capture: PhantomData::<&'a T>, }, + _capture: PhantomData::<&'a ()>, }); }, Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - on_completion(self.value.borrow_mut(), capture)?; + completion.complete(self.value.borrow_mut())?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, value: self.value, status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, }) } @@ -172,36 +211,32 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// This method must only be used to construct a larger asynchronous /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. 
- pub unsafe fn unwrap_unchecked( - self, - ) -> CudaResult<( - T, - Option<(C, Box CudaResult<()> + 'static>)>, - )> { + pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { match self.status { AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), AsyncStatus::Processing { receiver: _, - capture, - on_completion, + completion, event: _, - } => Ok((self.value, Some((capture, on_completion)))), + _capture, + } => Ok((self.value, Some(completion))), } } } #[cfg(feature = "host")] -struct AsyncFuture<'stream, T: BorrowMut, C, B: ?Sized> { +struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { _stream: PhantomData<&'stream Stream>, value: Option, - #[allow(clippy::type_complexity)] - capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, - status: AsyncStatus<(), B>, + completion: Option, + status: AsyncStatus<'a, T, NoCompletion>, } #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> Future + for AsyncFuture<'a, 'stream, T, C> +{ type Output = CudaResult; fn poll( @@ -214,9 +249,9 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, match &mut this.status { AsyncStatus::Processing { receiver, - capture: (), - on_completion: _, + completion: _, event: _, + _capture, } => match std::pin::Pin::new(receiver).poll(cx) { Poll::Ready(Ok(Ok(()))) => (), Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), @@ -233,8 +268,8 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, return Poll::Ready(Err(CudaError::AlreadyAcquired)); }; - if let Some((capture, on_completion)) = this.capture_on_completion.take() { - on_completion(value.borrow_mut(), capture)?; + if let Some(completion) = this.completion.take() { + completion.complete(value.borrow_mut())?; } Poll::Ready(Ok(value)) @@ -242,26 +277,31 @@ impl<'stream, 
T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, } #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture + for Async<'a, 'stream, T, C> +{ type Output = CudaResult; type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (capture_on_completion, status) = match self.status { - AsyncStatus::Completed { result } => (None, AsyncStatus::Completed { result }), + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match self.status + { + AsyncStatus::Completed { result } => { + (None, AsyncStatus::Completed:: { result }) + }, AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, + _capture, } => ( - Some((capture, on_completion)), - AsyncStatus::Processing { + Some(completion), + AsyncStatus::Processing:: { receiver, - capture: (), - on_completion: Box::new(|_self, ()| Ok(())), + completion: NoCompletion, event, + _capture: PhantomData::<&'a T>, }, ), }; @@ -269,7 +309,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, AsyncFuture { _stream: PhantomData::<&'stream Stream>, value: Some(self.value), - capture_on_completion, + completion, status, } } From 9dc2ae7c30f1ad42941a033a3b8868fe96ebbf8b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 2 Jan 2024 05:53:07 +0000 Subject: [PATCH 077/120] Add RustToCudaAsync impls for &T and &[T], but not &mut T or &mut [T] --- .github/workflows/ci.yml | 4 +- Cargo.toml | 4 +- src/lend/impls/ref.rs | 73 +++++++++++++++++++++++++++++++- src/lend/impls/ref_mut.rs | 3 ++ src/lend/impls/slice_ref.rs | 75 ++++++++++++++++++++++++++++++++- src/lend/impls/slice_ref_mut.rs | 3 ++ 6 files changed, 154 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a8f37a6dd..fcf0fd63c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,7 +63,7 @@ 
jobs: - name: Check feature powerset on CUDA run: | cargo hack check --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive,regex \ + --skip host \ --keep-going \ --target nvptx64-nvidia-cuda @@ -182,7 +182,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack clippy --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive,regex \ + --skip host \ --keep-going \ --target nvptx64-nvidia-cuda \ -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index 90626aae6..eb0e1725f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] -host = ["dep:rustacuda", "dep:regex", "dep:oneshot"] +host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"] kernel = ["dep:rust-cuda-kernel"] [dependencies] @@ -34,7 +34,7 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } -safer_owning_ref = { version = "0.5" } +safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } rust-cuda-derive = { path = "rust-cuda-derive", optional = true } diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index c068920ab..501393f63 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -1,12 +1,14 @@ use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, utils::ffi::DeviceConstPointer, }; @@ -19,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, 
utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -69,6 +72,72 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T } } +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&T>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = 
Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index 2a59d8953..cab1ea8df 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -76,6 +76,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu } } +// &mut T cannot implement RustToCudaAsync since the reference, potentially +// with garbage data, would remain accessible after failing a mutable restore + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefMutCudaRepresentation<'a, T> { diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 70d3a1e63..4f8a3ecd9 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -1,12 +1,14 @@ use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBuffer}; +use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, utils::ffi::DeviceConstPointer, }; @@ -19,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -72,6 +75,74 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T } } +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = 
crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefCudaRepresentation<'a, T> { diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index 0e802ccca..5c766dd24 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -79,6 +79,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for 
&'a mu } } +// &mut [T] cannot implement RustToCudaAsync since the slice, potentially with +// garbage data, would remain accessible after failing a mutable restore + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefMutCudaRepresentation<'a, T> { From 91f9246832390215bc91ebf00b6692839918a13c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 04:02:35 +0000 Subject: [PATCH 078/120] Add back mostly unchanged exchange wrapper + buffer with RustToCudaAsync impls --- src/host/mod.rs | 6 - src/utils/async.rs | 26 ++ src/utils/exchange/buffer/host.rs | 51 ++- src/utils/exchange/buffer/mod.rs | 47 ++- src/utils/exchange/mod.rs | 6 +- src/utils/exchange/wrapper.rs | 536 +++++++++--------------------- 6 files changed, 262 insertions(+), 410 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index f77c75792..a705c8504 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -362,10 +362,7 @@ impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> pub unsafe fn new( device_box: &'a mut DeviceBox>, host_ref: &'a mut T, - stream: &'stream Stream, ) -> Self { - let _ = stream; - Self { device_box, host_ref, @@ -448,10 +445,7 @@ impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> pub const unsafe fn new( device_box: &'a DeviceBox>, host_ref: &'a T, - stream: &'stream Stream, ) -> Self { - let _ = stream; - Self { device_box, host_ref, diff --git a/src/utils/async.rs b/src/utils/async.rs index 6aab8adca..f408431ae 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -223,6 +223,32 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } => Ok((self.value, Some(completion))), } } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. 
+ /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub const unsafe fn unwrap_ref_unchecked(&self) -> &T { + &self.value + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + &mut self.value + } } #[cfg(feature = "host")] diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index e62227d8e..ce0cb9d41 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -16,6 +16,7 @@ use crate::{ utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::{DeviceAccessible, DeviceMutPointer}, + r#async::{Async, CompletionFnMut, NoCompletion}, }, }; @@ -174,12 +175,12 @@ impl { #[allow(clippy::type_complexity)] - pub unsafe fn borrow_async( + pub unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible>, + Async<'_, 'stream, DeviceAccessible>>, CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell @@ -196,33 +197,49 @@ impl( - &mut self, + pub unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { let (_alloc_front, alloc_tail) = alloc.split(); 
if M2H { // Only move the buffer contents back to the host if needed + let this: &mut Self = &mut this; + rustacuda::memory::AsyncCopyDestination::async_copy_to( - &***self.device_buffer.get_mut(), - self.host_buffer.as_mut_slice(), + &***this.device_buffer.get_mut(), + this.host_buffer.as_mut_slice(), stream, )?; } - Ok(alloc_tail) + let r#async = if M2H { + Async::<_, CompletionFnMut<'a, Self>>::pending(this, stream, Box::new(|_this| Ok(())))? + } else { + Async::ready(this, stream) + }; + + Ok((r#async, alloc_tail)) } } diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 9dfc4414e..c48a715ac 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -20,6 +20,7 @@ use crate::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, utils::ffi::DeviceAccessible, + utils::r#async::{Async, CompletionFnMut}, }; #[cfg(any(feature = "host", feature = "device"))] @@ -133,25 +134,51 @@ unsafe impl( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, )> { self.inner.borrow_async(alloc, stream) } #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.inner.restore_async(alloc, stream) + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = 
host::CudaExchangeBufferHost::restore_async( + this.map_mut(|this| &mut this.inner), + alloc, + stream, + )?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index 9c0de5e36..722e02559 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,4 +1,4 @@ -// pub mod buffer; +pub mod buffer; -// #[cfg(feature = "host")] -// pub mod wrapper; +#[cfg(feature = "host")] +pub mod wrapper; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 454ecc8f3..09aef582d 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,16 +1,9 @@ -use std::{ - future::{Future, IntoFuture}, - marker::PhantomData, - ops::{Deref, DerefMut}, - sync::{Arc, Mutex}, - task::{Poll, Waker}, -}; +use std::ops::{Deref, DerefMut}; use rustacuda::{ - error::{CudaError, CudaResult}, - event::{Event, EventFlags, EventStatus}, + error::CudaResult, memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, - stream::{Stream, StreamWaitEventFlags}, + stream::Stream, }; use crate::{ @@ -20,32 +13,16 @@ use crate::{ HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::{adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible}, + utils::{ + adapter::DeviceCopyWithPortableBitSemantics, + ffi::DeviceAccessible, + r#async::{Async, CompletionFnMut, NoCompletion}, + }, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { - value: T, - device_box: CudaDropWrapper< - DeviceBox< - 
DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - locked_cuda_repr: CudaDropWrapper< - LockedBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - move_event: CudaDropWrapper, -} - -#[allow(clippy::module_name_repetitions)] -pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { - value: T, + value: Box, device_box: CudaDropWrapper< DeviceBox< DeviceCopyWithPortableBitSemantics< @@ -60,34 +37,11 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda, >, >, - move_event: CudaDropWrapper, - stream: PhantomData<&'stream Stream>, - waker: Arc>>, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDevice> { - value: T, - device_box: CudaDropWrapper< - DeviceBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - locked_cuda_repr: CudaDropWrapper< - LockedBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - move_event: CudaDropWrapper, -} - -#[allow(clippy::module_name_repetitions)] -pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { - value: T, + value: Box, device_box: CudaDropWrapper< DeviceBox< DeviceCopyWithPortableBitSemantics< @@ -102,9 +56,6 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda, >, >, - move_event: CudaDropWrapper, - stream: &'stream Stream, - waker: Arc>>, } impl> ExchangeWrapperOnHost { @@ -130,16 +81,14 @@ impl> ExchangeWrapperOnHost { uninit }; - let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); - Ok(Self { - value, + value: Box::new(value), device_box, locked_cuda_repr, - move_event, }) } + // TODO: safety constraint? /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably /// via [`ExchangeWrapperOnDevice::as_mut`]. 
@@ -164,7 +113,6 @@ impl> ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, }) } } @@ -172,6 +120,8 @@ impl> ExchangeWrapperOnHost { impl> ExchangeWrapperOnHost { + #[allow(clippy::needless_lifetimes)] // keep 'stream explicit + // TODO: safety constraint? /// Moves the data asynchronously to the CUDA device. /// /// To avoid aliasing, each CUDA thread will get access to its own shallow @@ -182,11 +132,14 @@ impl( mut self, - stream: &Stream, - ) -> CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; + stream: &'stream Stream, + ) -> CudaResult, NoCompletion>> { + let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, _completion): (_, Option) = + unsafe { cuda_repr.unwrap_unchecked()? }; + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); // Safety: The device value is not safely exposed until either @@ -196,112 +149,16 @@ impl>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - let _: NoCudaAlloc = null_alloc.into(); - - Ok(ExchangeWrapperOnDeviceAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream, - waker, - }) - } -} - -impl<'stream, T: RustToCuda> - ExchangeWrapperOnHostAsync<'stream, T> -{ - /// Synchronises the host CPU thread until the data has moved to the CPU. 
- /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn sync_to_host(self) -> CudaResult> { - self.move_event.synchronize()?; - - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) - } - - /// Moves the asynchronous data move to a different [`Stream`]. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { - stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; - self.move_event.record(stream)?; - - let waker_callback = self.waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&Stream>, - waker: self.waker, - }) - } -} -impl<'stream, T: RustToCuda> IntoFuture - for ExchangeWrapperOnHostAsync<'stream, T> -{ - type Output = CudaResult>; - - type IntoFuture = impl Future; - - fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - core::future::poll_fn(move |cx| match &wrapper { - Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut w| { - *w = Some(cx.waker().clone()); - Poll::Pending - }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { - Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { - value: inner.value, - device_box: inner.device_box, - locked_cuda_repr: inner.locked_cuda_repr, - move_event: inner.move_event, - })), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Err(err) => Poll::Ready(Err(err)), + 
Async::pending( + ExchangeWrapperOnDevice { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }) + stream, + NoCompletion, + ) } } @@ -319,83 +176,60 @@ impl> DerefMut for ExchangeWrapper } } -impl<'stream, T: RustToCuda> - ExchangeWrapperOnDeviceAsync<'stream, T> -{ - /// Synchronises the host CPU thread until the data has moved to the GPU. +impl> ExchangeWrapperOnDevice { + // TODO: safety constraint? + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn sync_to_device(self) -> CudaResult> { - self.move_event.synchronize()?; + pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); - Ok(ExchangeWrapperOnDevice { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) - } + // Reflect deep changes back to the CPU + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; - /// Moves the asynchronous data move to a different [`Stream`]. 
- /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_stream( - self, - stream: &Stream, - ) -> CudaResult> { - stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; - self.move_event.record(stream)?; - - let waker_callback = self.waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnDeviceAsync { + // Note: Shallow changes are not reflected back to the CPU + + Ok(ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream, - waker: self.waker, }) } - pub fn as_ref_async( + #[must_use] + pub fn as_ref( &self, - ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { + ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRefAsync::new( - &*self.device_box, - (**self.locked_cuda_repr).into_ref(), - self.stream, - ) + HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) } } - pub fn as_mut_async( + #[must_use] + pub fn as_mut( &mut self, - ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRefAsync::new( - &mut self.device_box, - (**self.locked_cuda_repr).into_mut(), - self.stream, - ) + HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) } } +} - /// Moves the data synchronously back to the host CPU device. +impl> + ExchangeWrapperOnDevice +{ + #[allow(clippy::needless_lifetimes)] // keep 'stream explicit + // TODO: safety constraint? + /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow /// copy of the data. Hence, @@ -405,28 +239,60 @@ impl<'stream, T: RustToCuda> /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_host(mut self) -> CudaResult> { + pub fn move_to_host_async<'stream>( + self, + stream: &'stream Stream, + ) -> CudaResult< + Async< + 'static, + 'stream, + ExchangeWrapperOnHost, + CompletionFnMut<'static, ExchangeWrapperOnHost>, + >, + > { let null_alloc = NoCudaAlloc.into(); + let value = owning_ref::BoxRefMut::new(self.value); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; + let (r#async, _null_alloc): (_, NoCudaAlloc) = + unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?; + let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? }; + + let value = value.into_owner(); // Note: Shallow changes are not reflected back to the CPU - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) + if let Some(on_complete) = on_complete { + Async::<_, CompletionFnMut>>::pending( + ExchangeWrapperOnHost { + value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + }, + stream, + Box::new(|on_host: &mut ExchangeWrapperOnHost| on_complete(&mut on_host.value)), + ) + } else { + Ok(Async::ready( + ExchangeWrapperOnHost { + value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + }, + stream, + )) + } } } impl< + 'a, 'stream, T: RustToCudaAsync, - > ExchangeWrapperOnDeviceAsync<'stream, T> + > Async<'a, 'stream, ExchangeWrapperOnDevice, NoCompletion> { + // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow @@ -438,165 +304,87 @@ impl< /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host_async( - mut self, + self, stream: &'stream Stream, - ) -> CudaResult> { - let null_alloc = NoCudaAlloc.into(); - - // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; - - // Note: Shallow changes are not reflected back to the CPU - - self.move_event.record(stream)?; - - let waker: Arc>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&'stream Stream>, - waker, - }) - } -} - -impl<'stream, T: RustToCuda> IntoFuture - for ExchangeWrapperOnDeviceAsync<'stream, T> -{ - type Output = CudaResult>; - - type IntoFuture = impl Future; - - fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - core::future::poll_fn(move |cx| match &wrapper { - Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut w| { - *w = Some(cx.waker().clone()); - Poll::Pending - }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { - Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { - value: inner.value, - device_box: inner.device_box, - locked_cuda_repr: inner.locked_cuda_repr, - move_event: inner.move_event, - })), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Err(err) => Poll::Ready(Err(err)), - }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }) - } -} + ) -> CudaResult< + Async< + 
'static, + 'stream, + ExchangeWrapperOnHost, + CompletionFnMut<'static, ExchangeWrapperOnHost>, + >, + > { + let (this, completion): (_, Option) = unsafe { self.unwrap_unchecked()? }; -impl> ExchangeWrapperOnDevice { - /// Moves the data synchronously back to the host CPU device. - /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_host(mut self) -> CudaResult> { let null_alloc = NoCudaAlloc.into(); + let value = owning_ref::BoxRefMut::new(this.value); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; + let (r#async, _null_alloc): (_, NoCudaAlloc) = + unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?; + let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? 
}; + + let value = value.into_owner(); // Note: Shallow changes are not reflected back to the CPU - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) + let on_host = ExchangeWrapperOnHost { + value, + device_box: this.device_box, + locked_cuda_repr: this.locked_cuda_repr, + }; + + if let Some(on_complete) = on_complete { + Async::<_, CompletionFnMut>>::pending( + on_host, + stream, + Box::new(|on_host: &mut ExchangeWrapperOnHost| on_complete(&mut on_host.value)), + ) + } else if matches!(completion, Some(NoCompletion)) { + Async::<_, CompletionFnMut>>::pending( + on_host, + stream, + Box::new(|_on_host: &mut ExchangeWrapperOnHost| Ok(())), + ) + } else { + Ok(Async::ready(on_host, stream)) + } } - pub fn as_ref( + // TODO: replace by async borrow map + #[must_use] + pub fn as_ref_async( &self, - ) -> HostAndDeviceConstRef::CudaRepresentation>> { + ) -> HostAndDeviceConstRefAsync< + 'stream, + '_, + DeviceAccessible<::CudaRepresentation>, + > { + let this = unsafe { self.unwrap_ref_unchecked() }; + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + HostAndDeviceConstRefAsync::new( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ) } } - pub fn as_mut( + // TODO: replace by async borrow map mut + #[must_use] + pub fn as_mut_async( &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> { + ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + let this = unsafe { self.unwrap_mut_unchecked() }; + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) + HostAndDeviceMutRefAsync::new( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ) } } } - -impl> - ExchangeWrapperOnDevice -{ - 
/// Moves the data asynchronously back to the host CPU device. - /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_host_async( - mut self, - stream: &Stream, - ) -> CudaResult> { - let null_alloc = NoCudaAlloc.into(); - - // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; - - // Note: Shallow changes are not reflected back to the CPU - - self.move_event.record(stream)?; - - let waker: Arc>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&Stream>, - waker, - }) - } -} From 7e2801f06862c35041f978b27931d71bc574f9c4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 04:14:13 +0000 Subject: [PATCH 079/120] Add back mostly unchanged anti-aliasing types with RustToCudaAsync impls --- src/utils/aliasing/const.rs | 70 +++++++++++++++++++++++++++------- src/utils/aliasing/dynamic.rs | 71 +++++++++++++++++++++++++++-------- src/utils/aliasing/mod.rs | 8 ++-- 3 files changed, 116 insertions(+), 33 deletions(-) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 8441a5bd1..0259c301a 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -219,29 +219,71 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn 
borrow_async( + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, )> { - let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; - - Ok(( - DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)), - alloc, - )) + let (r#async, alloc) = self.0.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.0.restore_async(alloc, stream) + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.0), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.0)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index f8a04fa06..1c502dc8e 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -197,32 +197,73 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, )> { - let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; + let (r#async, alloc) = self.inner.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; - Ok(( - DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + let cuda_repr = DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( cuda_repr, - self.stride, - )), - alloc, - )) + stream, + crate::utils::r#async::NoCompletion, + )? 
+ } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.inner.restore_async(alloc, stream) + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.inner), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index aa0a42742..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,5 +1,5 @@ -// mod r#const; -// mod dynamic; +mod r#const; +mod dynamic; -// pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; -// pub use r#const::SplitSliceOverCudaThreadsConstStride; +pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; +pub use r#const::SplitSliceOverCudaThreadsConstStride; From af999e5b4620d929a1562d12d0ca7c5b9c73e0c1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree 
Date: Wed, 3 Jan 2024 11:14:07 +0000 Subject: [PATCH 080/120] Progress on replacing ...Async with Async<...> --- src/host/mod.rs | 213 +++++----------------------------- src/kernel/mod.rs | 6 +- src/kernel/param.rs | 121 ++++++++++++------- src/utils/exchange/wrapper.rs | 47 ++++---- 4 files changed, 130 insertions(+), 257 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index a705c8504..62870dd39 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -22,6 +22,7 @@ use crate::{ DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, DeviceOwnedRef, }, + r#async::{Async, NoCompletion}, }, }; @@ -190,15 +191,20 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + pub fn as_async<'b, 'stream>( + &'b mut self, + stream: &'stream Stream, + ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, { - HostAndDeviceMutRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: PhantomData::<&'stream Stream>, - } + Async::ready( + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) } } @@ -284,15 +290,20 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub const fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_async<'b, 'stream>( + &'b self, + stream: &'stream Stream, + ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, NoCompletion> where 'a: 'b, { - HostAndDeviceConstRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: PhantomData::<&'stream Stream>, - } + Async::ready( + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) } } @@ -337,178 +348,10 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } 
#[must_use] - pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { - HostAndDeviceOwnedAsync { - device_box: self.device_box, - host_val: self.host_val, - stream: PhantomData::<&'stream Stream>, - } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - stream: PhantomData<&'stream Stream>, -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceMutRefAsync<'stream, 'a, T> -{ - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - ) -> Self { - Self { - device_box, - host_ref, - stream: PhantomData::<&'stream Stream>, - } - } - - #[must_use] - /// # Safety - /// - /// The returned [`DeviceMutRef`] must only be used on the constructed-with - /// [`Stream`] - pub unsafe fn for_device_async<'b>(&'b mut self) -> DeviceMutRef<'a, T> - where - 'a: 'b, - { - DeviceMutRef { - pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - HostAndDeviceConstRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: self.stream, - } - } - - #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - HostAndDeviceMutRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: self.stream, - } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a DeviceBox>, - host_ref: &'a T, - stream: PhantomData<&'stream Stream>, -} - 
-impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Clone - for HostAndDeviceConstRefAsync<'stream, 'a, T> -{ - fn clone(&self) -> Self { - *self - } -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Copy - for HostAndDeviceConstRefAsync<'stream, 'a, T> -{ -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceConstRefAsync<'stream, 'a, T> -{ - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - #[must_use] - pub const unsafe fn new( - device_box: &'a DeviceBox>, - host_ref: &'a T, - ) -> Self { - Self { - device_box, - host_ref, - stream: PhantomData::<&'stream Stream>, - } - } - - #[must_use] - /// # Safety - /// - /// The returned [`DeviceConstRef`] must only be used on the - /// constructed-with [`Stream`] - pub unsafe fn for_device_async<'b>(&'b self) -> DeviceConstRef<'a, T> - where - 'a: 'b, - { - let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); - - DeviceConstRef { - pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), - reference: PhantomData, - } - } - - #[must_use] - pub const fn for_host(&'a self) -> &'a T { - self.host_ref - } - - #[must_use] - pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - *self - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a mut DeviceBox>, - host_val: &'a mut T, - stream: PhantomData<&'stream Stream>, -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceOwnedAsync<'stream, 'a, T> -{ - #[must_use] - /// # Safety - /// - /// The returned [`DeviceOwnedRef`] must only be used on the - /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { - DeviceOwnedRef { - pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), - marker: 
PhantomData::, - reference: PhantomData::<&'a mut ()>, - } - } - - #[must_use] - pub fn for_host(&self) -> &T { - self.host_val + pub const fn into_async<'stream>( + self, + stream: &'stream Stream, + ) -> Async<'a, 'stream, Self, NoCompletion> { + Async::ready(self, stream) } } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index b6ed5b8e7..40985a0e8 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -67,9 +67,9 @@ pub trait CudaKernelParameter: sealed::Sealed { #[doc(hidden)] #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; + ) -> Result, E>; #[doc(hidden)] #[cfg(feature = "device")] @@ -377,7 +377,7 @@ macro_rules! impl_typed_kernel_launch { shared_memory_size, &[ $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg) + &mut $T::async_to_ffi($arg)? ).cast::()),* ], ) } diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 17d4bc3a5..ad4ade594 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -102,10 +102,10 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - param + ) -> Result, E> { + Ok(param) } #[cfg(feature = "device")] @@ -138,7 +138,12 @@ impl< > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceConstRef<'b, T>, + crate::utils::r#async::NoCompletion, + >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; @@ -148,10 +153,12 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: 
&'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| inner(const_ref.as_async())) + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { + inner(const_ref.as_async(stream)) + }) } #[cfg(feature = "host")] @@ -168,10 +175,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -228,6 +237,7 @@ impl< param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -237,9 +247,9 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } @@ -303,7 +313,12 @@ impl< > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceConstRef<'b, T>, + crate::utils::r#async::NoCompletion, + >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; @@ -315,11 +330,11 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl 
for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async()) + inner(const_ref.as_ref().as_async(stream)) }) } @@ -337,10 +352,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -414,10 +431,14 @@ impl< > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< - 'stream, + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< 'b, - DeviceAccessible<::CudaRepresentation>, + 'stream, + crate::host::HostAndDeviceOwned< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; @@ -429,10 +450,10 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async(stream))) } #[cfg(feature = "host")] @@ -449,10 +470,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? 
}; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -478,10 +501,14 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< 'b, - DeviceAccessible<::CudaRepresentation>, + 'stream, + crate::host::HostAndDeviceConstRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -493,10 +520,10 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream))) } #[cfg(feature = "host")] @@ -513,10 +540,12 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? 
}; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -565,13 +594,14 @@ impl< param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { as CudaKernelParameter>::async_to_ffi(param) } @@ -634,6 +664,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -643,9 +674,9 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } @@ -738,13 +769,13 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { + ) -> Result, E> { + Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], _marker: PhantomData::, - } + }) } #[cfg(feature = "device")] @@ -795,13 +826,13 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedSliceFfi { + ) -> Result, E> { + 
Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), _marker: [], - } + }) } #[cfg(feature = "device")] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 09aef582d..aeee541e1 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -8,10 +8,7 @@ use rustacuda::{ use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, - host::{ - CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, - HostAndDeviceMutRefAsync, - }, + host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef}, lend::{RustToCuda, RustToCudaAsync}, utils::{ adapter::DeviceCopyWithPortableBitSemantics, @@ -356,35 +353,37 @@ impl< #[must_use] pub fn as_ref_async( &self, - ) -> HostAndDeviceConstRefAsync< - 'stream, - '_, - DeviceAccessible<::CudaRepresentation>, - > { + ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> + { let this = unsafe { self.unwrap_ref_unchecked() }; - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceConstRefAsync::new( - &*(this.device_box), - (**(this.locked_cuda_repr)).into_ref(), - ) - } + todo!() + + // Safety: `device_box` contains exactly the device copy of + // `locked_cuda_repr` unsafe { + // HostAndDeviceConstRefAsync::new( + // &*(this.device_box), + // (**(this.locked_cuda_repr)).into_ref(), + // ) + // } } // TODO: replace by async borrow map mut #[must_use] pub fn as_mut_async( &mut self, - ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> + { let this = unsafe { self.unwrap_mut_unchecked() }; - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceMutRefAsync::new( - &mut *(this.device_box), - (**(this.locked_cuda_repr)).into_mut(), - ) - } + todo!() + + // Safety: `device_box` contains exactly the device copy of + // `locked_cuda_repr` unsafe { + 
// HostAndDeviceMutRefAsync::new( + // &mut *(this.device_box), + // (**(this.locked_cuda_repr)).into_mut(), + // ) + // } } } From 24efa2301379b96b71d91e0016d08787dcf31980 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 13:29:53 +0000 Subject: [PATCH 081/120] Seal more implementation details --- src/host/mod.rs | 14 +++++---- src/kernel/mod.rs | 28 +++++++++++------ src/kernel/param.rs | 76 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 89 insertions(+), 29 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 62870dd39..9efc7b9b1 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -152,8 +152,9 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { result } + #[allow(dead_code)] // FIXME #[must_use] - pub fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> + pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> where 'a: 'b, { @@ -163,8 +164,9 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } + #[allow(dead_code)] // FIXME #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { + pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { self.host_ref } @@ -264,7 +266,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> + pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> where 'a: 'b, { @@ -277,7 +279,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub const fn for_host(&'a self) -> &'a T { + pub(crate) const fn for_host(&'a self) -> &'a T { self.host_ref } @@ -334,7 +336,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } #[must_use] - pub fn for_device(self) -> DeviceOwnedRef<'a, T> { + pub(crate) fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { pointer: 
DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, @@ -343,7 +345,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } #[must_use] - pub fn for_host(&self) -> &T { + pub(crate) fn for_host(&self) -> &T { self.host_val } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 40985a0e8..7026efc1a 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -34,6 +34,9 @@ pub mod param; mod sealed { #[doc(hidden)] pub trait Sealed {} + + #[cfg(feature = "host")] + pub struct Token; } pub trait CudaKernelParameter: sealed::Sealed { @@ -58,17 +61,22 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O; #[doc(hidden)] #[cfg(feature = "host")] - fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> std::alloc::Layout; + fn shared_layout_for_async( + param: &Self::AsyncHostType<'_, '_>, + token: sealed::Token, + ) -> std::alloc::Layout; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E>; #[doc(hidden)] @@ -139,10 +147,10 @@ macro_rules! impl_launcher_launch { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { + (impl $func:ident () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func($arg0 $(, $other)*, |$arg0| { impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) @@ -353,7 +361,7 @@ macro_rules! 
impl_typed_kernel_launch { Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), { let function = if config.ptx_jit { - impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + (sealed::Token) { self.compile_with_ptx_jit_args(Some(&[$($arg),*])) } }? } else { @@ -363,7 +371,7 @@ macro_rules! impl_typed_kernel_launch { #[allow(unused_mut)] let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new(); $( - shared_memory_size.add($T::shared_layout_for_async(&$arg)); + shared_memory_size.add($T::shared_layout_for_async(&$arg, sealed::Token)); )* let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else { // FIXME: this should really be InvalidConfiguration = 9 @@ -377,24 +385,24 @@ macro_rules! impl_typed_kernel_launch { shared_memory_size, &[ $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg)? + &mut $T::async_to_ffi($arg, sealed::Token)? ).cast::()),* ], ) } } }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { + (impl $func:ident () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func($arg0 $(, $other)*, |$arg0| { impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; - (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + (impl $func:ident ref () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func(&$arg0 $(, $other)*, |$arg0| { impl_typed_kernel_launch! 
{ impl $func ref ($($arg: $T),*) + ($($other),*) $inner } }) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index ad4ade594..0e3bf8790 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -91,19 +91,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(param) } @@ -164,19 +169,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -235,6 +245,7 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -242,15 +253,19 @@ impl< } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -341,19 +356,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -459,19 +479,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? }; @@ -529,19 +554,24 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -592,6 +622,7 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -601,12 +632,16 @@ impl< #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param) + as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } @@ -662,6 +697,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -669,15 +705,19 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -758,19 +798,24 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl 
for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], @@ -815,19 +860,24 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { param.layout() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), From 1e19fe13b6a0daf860f860aa78468f7941d5ca28 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 13:51:02 +0000 Subject: [PATCH 082/120] Further small API improvements --- src/host/mod.rs | 52 +++++++++++++++++------------------ src/lend/impls/option.rs | 18 +----------- src/lend/mod.rs | 7 ++--- src/utils/exchange/wrapper.rs | 10 +++++-- 4 files changed, 37 insertions(+), 50 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 9efc7b9b1..2ddc768dd 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -109,19 +109,6 @@ pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { } impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { - /// # Safety - /// - /// 
`device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - ) -> Self { - Self { - device_box, - host_ref, - } - } - /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -152,6 +139,19 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { result } + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new_unchecked( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> @@ -225,19 +225,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConst impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new( - device_box: &'a DeviceBox>, - host_ref: &'a T, - ) -> Self { - Self { - device_box, - host_ref, - } - } - /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -265,6 +252,19 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> result } + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub const unsafe fn new_unchecked( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + #[must_use] pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> where diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index c05c0d3bb..197906baf 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -6,7 +6,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use 
rustacuda::error::CudaResult; use crate::{ - lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaProxy}, safety::PortableBitSemantics, utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible}, }; @@ -214,19 +214,3 @@ impl RustToCudaProxy self.map(RustToCudaWithPortableBitCopySemantics::into_inner) } } - -impl RustToCudaAsyncProxy> - for Option> -{ - fn from_ref(val: &Option) -> &Self { - >>::from_ref(val) - } - - fn from_mut(val: &mut Option) -> &mut Self { - >>::from_mut(val) - } - - fn into(self) -> Option { - >>::into(self) - } -} diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 598a586b8..e2e5dcf99 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -155,12 +155,9 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } -pub trait RustToCudaAsyncProxy: RustToCudaAsync { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; +pub trait RustToCudaAsyncProxy: RustToCudaAsync + RustToCudaProxy {} - fn into(self) -> T; -} +impl> RustToCudaAsyncProxy for P {} #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index aeee541e1..660fdaae7 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -206,7 +206,10 @@ impl> ExchangeWrapperOnDevice { ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + HostAndDeviceConstRef::new_unchecked( + &self.device_box, + (**self.locked_cuda_repr).into_ref(), + ) } } @@ -216,7 +219,10 @@ impl> ExchangeWrapperOnDevice { ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRef::new(&mut self.device_box, 
(**self.locked_cuda_repr).into_mut()) + HostAndDeviceMutRef::new_unchecked( + &mut self.device_box, + (**self.locked_cuda_repr).into_mut(), + ) } } } From a0521861c267c747269213587cf50c6238a0632c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 05:24:05 +0000 Subject: [PATCH 083/120] Add AsyncProj helper API struct for async projections --- src/kernel/param.rs | 36 ++++++++------------ src/utils/async.rs | 64 +++++++++++++++++++++++------------ src/utils/exchange/wrapper.rs | 56 +++++++++++++++--------------- 3 files changed, 84 insertions(+), 72 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 0e3bf8790..8ca41ddf4 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -143,11 +143,10 @@ impl< > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef<'b, T>, - crate::utils::r#async::NoCompletion, + &'b crate::host::HostAndDeviceConstRef<'b, T>, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -162,7 +161,7 @@ impl< inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner(const_ref.as_async(stream)) + inner(const_ref.as_async(stream).as_ref()) }) } @@ -188,8 +187,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -248,7 +246,7 @@ impl< _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -328,11 +326,10 @@ impl< > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef<'b, T>, - crate::utils::r#async::NoCompletion, + &'b crate::host::HostAndDeviceConstRef<'b, T>, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -349,7 +346,7 @@ impl< inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async(stream)) + inner(const_ref.as_ref().as_async(stream).as_ref()) }) } @@ -375,8 +372,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -526,14 +522,13 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef< + &'b crate::host::HostAndDeviceConstRef< 'b, DeviceAccessible<::CudaRepresentation>, >, - crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -548,7 +543,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream))) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream).as_ref())) } #[cfg(feature = "host")] @@ -573,8 +568,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -625,7 +619,7 @@ impl< _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -700,7 +694,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } diff --git a/src/utils/async.rs b/src/utils/async.rs index f408431ae..d945538d0 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -119,6 +119,10 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// the inner value can be safely returned and again be used in synchronous /// operations. /// + /// Calling `synchronize` after the computation has completed, e.g. after + /// calling [`rustacuda::stream::Stream::synchronize`], should be very + /// cheap. + /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. @@ -224,30 +228,12 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } } - /// # Safety - /// - /// The returned reference to the inner value of type `T` may not yet have - /// completed its asynchronous work and may thus be in an inconsistent - /// state. - /// - /// This method must only be used to construct a larger asynchronous - /// computation out of smaller ones that have all been submitted to the - /// same [`Stream`]. 
- pub const unsafe fn unwrap_ref_unchecked(&self) -> &T { - &self.value + pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { + AsyncProj::new(&self.value) } - /// # Safety - /// - /// The returned reference to the inner value of type `T` may not yet have - /// completed its asynchronous work and may thus be in an inconsistent - /// state. - /// - /// This method must only be used to construct a larger asynchronous - /// computation out of smaller ones that have all been submitted to the - /// same [`Stream`]. - pub unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { - &mut self.value + pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { + AsyncProj::new(&mut self.value) } } @@ -340,3 +326,37 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture } } } + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone)] +pub struct AsyncProj<'a, 'stream, T: 'a> { + _capture: PhantomData<&'a ()>, + _stream: PhantomData<&'stream Stream>, + value: T, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub(crate) const fn new(value: T) -> Self { + Self { + _capture: PhantomData::<&'a ()>, + _stream: PhantomData::<&'stream Stream>, + value, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_unchecked(self) -> T { + self.value + } +} diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 660fdaae7..0f1ff89f8 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -13,7 +13,7 @@ use crate::{ utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible, - r#async::{Async, CompletionFnMut, NoCompletion}, + r#async::{Async, AsyncProj, CompletionFnMut, NoCompletion}, }, }; @@ -355,41 +355,39 @@ impl< } } - // TODO: replace by async borrow map #[must_use] pub fn as_ref_async( &self, - ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> - { - let this = unsafe { self.unwrap_ref_unchecked() }; - - todo!() - - // Safety: `device_box` contains exactly the device copy of - // `locked_cuda_repr` unsafe { - // HostAndDeviceConstRefAsync::new( - // &*(this.device_box), - // (**(this.locked_cuda_repr)).into_ref(), - // ) - // } + ) -> AsyncProj< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + > { + let this = unsafe { self.as_ref().unwrap_unchecked() }; + + AsyncProj::new(unsafe { + HostAndDeviceConstRef::new_unchecked( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ) + }) } - // TODO: replace by async borrow map mut #[must_use] pub fn as_mut_async( &mut self, - ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> - { - let this = unsafe { self.unwrap_mut_unchecked() }; - - todo!() - - // Safety: `device_box` contains exactly the device copy of - // `locked_cuda_repr` unsafe { - // HostAndDeviceMutRefAsync::new( - // &mut *(this.device_box), - // (**(this.locked_cuda_repr)).into_mut(), - // ) - // } + ) -> AsyncProj< + '_, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + > { + let this = unsafe { self.as_mut().unwrap_unchecked() }; + + AsyncProj::new(unsafe { + HostAndDeviceMutRef::new_unchecked( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ) + }) } 
} From b9d8ac0337bdad25e5c72991621af5fa865e197a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 05:25:39 +0000 Subject: [PATCH 084/120] Disable async derive in examples for now --- examples/derive/src/lib.rs | 4 ++-- examples/single-source/src/main.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 622b1b699..2d7b00ad6 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -3,14 +3,14 @@ #![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] struct Inner { #[cuda(embed)] inner: T, } #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 4783deffa..ec699d43c 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -24,7 +24,7 @@ fn main() {} pub struct Dummy(i32); #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] @@ -32,7 +32,7 @@ pub struct Wrapper { } #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] pub struct Empty([u8; 0]); #[repr(C)] From e0729b11ee4cfd31a36ffd133c2e14c0d7debb22 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 11:13:53 +0000 Subject: [PATCH 085/120] Implement RustToCudaAsync derive impls --- examples/derive/src/lib.rs | 4 +- examples/single-source/src/main.rs | 4 +- .../src/rust_to_cuda/field_copy.rs | 55 ++++++++++++-- rust-cuda-derive/src/rust_to_cuda/impl.rs | 74 +++++++++++++++---- rust-cuda-derive/src/rust_to_cuda/mod.rs | 6 ++ src/deps.rs | 3 + src/utils/async.rs | 20 ++++- 7 files changed, 140 insertions(+), 26 deletions(-) diff --git a/examples/derive/src/lib.rs 
b/examples/derive/src/lib.rs index 2d7b00ad6..622b1b699 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -3,14 +3,14 @@ #![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index ec699d43c..4783deffa 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -24,7 +24,7 @@ fn main() {} pub struct Dummy(i32); #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] @@ -32,7 +32,7 @@ pub struct Wrapper { } #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] pub struct Empty([u8; 0]); #[repr(C)] diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 1baf8829e..c32ac67ee 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -16,9 +16,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_declarations: &mut Vec, r2c_field_async_declarations: &mut Vec, + r2c_field_async_completions: &mut Vec, r2c_field_initialisations: &mut Vec, r2c_field_destructors: &mut Vec, r2c_field_async_destructors: &mut Vec, + r2c_field_async_completion_calls: &mut Vec, c2r_field_initialisations: &mut Vec, ) -> (TokenStream, TokenStream) { @@ -32,6 +34,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), }; + #[allow(clippy::option_if_let_else)] + let field_completion_ident = match &field.ident { + Some(ident) => 
format_ident!("field_{}_completion", ident), + None => format_ident!("field_{}_completion", field_index), + }; let optional_field_ident = field.ident.as_ref().map(|ident| quote! { #ident: }); match cuda_repr_field_ty { @@ -83,6 +90,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -96,13 +104,29 @@ pub fn impl_field_copy_init_and_expand_alloc_type( )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( - &mut self.#field_accessor, + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| &mut this.#field_accessor), alloc_front, stream, )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, &mut this.#field_accessor, + )?; }); + r2c_field_async_completions.push(field_completion_ident); + c2r_field_initialisations.push(quote! { #optional_field_ident { #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) @@ -139,6 +163,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -154,15 +179,33 @@ pub fn impl_field_copy_init_and_expand_alloc_type( )?; }); r2c_field_async_destructors.push(quote! 
{ - let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( - < - #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> - >::from_mut(&mut self.#field_accessor), + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| < + #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + >::from_mut(&mut this.#field_accessor)), alloc_front, stream, )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, < + #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + >::from_mut(&mut this.#field_accessor), + )?; }); + r2c_field_async_completions.push(field_completion_ident); + c2r_field_initialisations.push(quote! { #optional_field_ident { #crate_path::lend::RustToCudaProxy::<#field_ty>::into( diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index d1249720e..674f5e166 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -127,8 +127,10 @@ pub fn rust_to_cuda_async_trait( struct_fields_cuda: &syn::Fields, combined_cuda_alloc_async_type: &TokenStream, r2c_field_async_declarations: &[TokenStream], + r2c_field_async_completions: &[syn::Ident], r2c_field_initialisations: &[TokenStream], r2c_field_async_destructors: &[TokenStream], + r2c_field_async_completion_calls: &[TokenStream], ) -> TokenStream { let rust_to_cuda_struct_construction = match struct_fields_cuda { syn::Fields::Named(_) => quote! { @@ -144,6 +146,39 @@ pub fn rust_to_cuda_async_trait( syn::Fields::Unit => quote! 
{ #struct_name_cuda }, }; + let async_borrow_completion = if r2c_field_async_completions.is_empty() { + quote! { #crate_path::utils::r#async::Async::ready(borrow, stream) } + } else { + quote! { + if #(#r2c_field_async_completions.is_none())&&* { + #crate_path::utils::r#async::Async::ready(borrow, stream) + } else { + #crate_path::utils::r#async::Async::pending( + borrow, stream, #crate_path::utils::r#async::NoCompletion, + )? + } + } + }; + + let async_restore_completion = if r2c_field_async_completions.is_empty() { + quote! { #crate_path::utils::r#async::Async::ready(this, stream) } + } else { + quote! { + if #(#r2c_field_async_completions.is_none())&&* { + #crate_path::utils::r#async::Async::ready(this, stream) + } else { + #crate_path::utils::r#async::Async::< + _, #crate_path::utils::r#async::CompletionFnMut, + >::pending( + this, stream, Box::new(|this| { + #(#r2c_field_async_completion_calls)* + Ok(()) + }), + )? + } + } + }; + let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! 
{ @@ -153,13 +188,16 @@ pub fn rust_to_cuda_async_trait( type CudaAllocationAsync = #combined_cuda_alloc_async_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &#crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::utils::ffi::DeviceAccessible, - #crate_path::alloc::CombinedCudaAlloc + #crate_path::utils::r#async::Async< + '_, 'stream, + #crate_path::utils::ffi::DeviceAccessible, + >, + #crate_path::alloc::CombinedCudaAlloc, )> { let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; @@ -167,26 +205,36 @@ pub fn rust_to_cuda_async_trait( #(#r2c_field_async_declarations)* let borrow = #rust_to_cuda_struct_construction; + let borrow = #crate_path::utils::ffi::DeviceAccessible::from(borrow); - Ok(( - #crate_path::utils::ffi::DeviceAccessible::from(borrow), - #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) - )) + let r#async = #async_borrow_completion; + let alloc = #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail); + + Ok((r#async, alloc)) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, CudaAllocType: #crate_path::alloc::CudaAlloc, CudaRestoreOwner>( + this: #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>, alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &#crate_path::deps::rustacuda::stream::Stream, - ) -> #crate_path::deps::rustacuda::error::CudaResult { + stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + ) -> #crate_path::deps::rustacuda::error::CudaResult<( + #crate_path::utils::r#async::Async< + 'a, 'stream, + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>, + 
#crate_path::utils::r#async::CompletionFnMut<'a, Self>, + >, + CudaAllocType, + )> { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* - Ok(alloc_tail) + let r#async = #async_restore_completion; + + Ok((r#async, alloc_tail)) } } } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 77382d4c4..615c81edf 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -38,9 +38,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); + let mut r2c_field_async_completions: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); let mut r2c_field_destructors: Vec = Vec::new(); let mut r2c_field_async_destructors: Vec = Vec::new(); + let mut r2c_field_async_completion_calls: Vec = Vec::new(); let mut c2r_field_initialisations: Vec = Vec::new(); @@ -70,9 +72,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { combined_cuda_alloc_async_type, &mut r2c_field_declarations, &mut r2c_field_async_declarations, + &mut r2c_field_async_completions, &mut r2c_field_initialisations, &mut r2c_field_destructors_reverse, &mut r2c_field_async_destructors_reverse, + &mut r2c_field_async_completion_calls, &mut c2r_field_initialisations, ); } @@ -117,8 +121,10 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &struct_fields_cuda, &combined_cuda_alloc_async_type, &r2c_field_async_declarations, + &r2c_field_async_completions, &r2c_field_initialisations, &r2c_field_async_destructors, + &r2c_field_async_completion_calls, ) } else { TokenStream::new() diff --git a/src/deps.rs b/src/deps.rs index 68257e095..0000f9250 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -2,6 +2,9 @@ pub(crate) extern crate alloc; pub extern crate const_type_layout; +#[cfg(feature = "host")] 
+pub extern crate owning_ref; + #[cfg(feature = "host")] pub extern crate rustacuda; diff --git a/src/utils/async.rs b/src/utils/async.rs index d945538d0..84d8cedd8 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -16,7 +16,7 @@ pub struct NoCompletion; pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; #[cfg(feature = "host")] -pub trait Completion>: sealed::Sealed { +pub trait Completion>: sealed::Sealed { type Completed: ?Sized; #[allow(clippy::missing_errors_doc)] // FIXME @@ -28,9 +28,10 @@ mod sealed { } #[cfg(feature = "host")] -impl Completion for NoCompletion { +impl Completion for NoCompletion { type Completed = T; + #[inline] fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { Ok(()) } @@ -39,9 +40,10 @@ impl Completion for NoCompletion { impl sealed::Sealed for NoCompletion {} #[cfg(feature = "host")] -impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { +impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { (self)(completed) } @@ -49,6 +51,18 @@ impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { #[cfg(feature = "host")] impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} +#[cfg(feature = "host")] +impl, C: Completion> Completion for Option { + type Completed = C::Completed; + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + self.map_or(Ok(()), |completion| completion.complete(completed)) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for Option {} + #[cfg(feature = "host")] pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { _stream: PhantomData<&'stream Stream>, From 875f04981021cd34fbd379fe057c78b37d0580c9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 5 Jan 2024 03:46:46 +0000 Subject: [PATCH 086/120] Further async API improvements to add drop 
behaviour --- src/lend/mod.rs | 17 ++++++- src/utils/async.rs | 122 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 22 deletions(-) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index e2e5dcf99..7d8a1e864 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -256,7 +256,13 @@ pub trait LendToCudaAsync: RustToCudaAsync { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure - /// - after the closure, `&self` will not have changed + /// - after the closure, `&self` will not have changed, i.e. interior + /// mutability is not handled by this method + /// + /// Since the [`HostAndDeviceConstRef`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. /// /// # Errors /// @@ -270,6 +276,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -282,6 +289,11 @@ pub trait LendToCudaAsync: RustToCudaAsync { /// Moves `self` to CUDA iff `self` is [`StackOnly`]. /// + /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. 
+ /// /// # Errors /// /// Returns a [`CudaError`] iff an error occurs inside CUDA @@ -294,6 +306,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -316,6 +329,7 @@ impl LendToCudaAsync for T { '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -355,6 +369,7 @@ impl LendToCudaAsync for T { 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( diff --git a/src/utils/async.rs b/src/utils/async.rs index 84d8cedd8..87b91a3e0 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -19,6 +19,9 @@ pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a> pub trait Completion>: sealed::Sealed { type Completed: ?Sized; + #[doc(hidden)] + fn synchronize_on_drop(&self) -> bool; + #[allow(clippy::missing_errors_doc)] // FIXME fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; } @@ -31,6 +34,11 @@ mod sealed { impl Completion for NoCompletion { type Completed = T; + #[inline] + fn synchronize_on_drop(&self) -> bool { + false + } + #[inline] fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { Ok(()) @@ -43,6 +51,11 @@ impl sealed::Sealed for NoCompletion {} impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] + fn synchronize_on_drop(&self) -> bool { + true + } + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { (self)(completed) @@ -55,6 +68,11 @@ impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} impl, C: Completion> Completion for Option { type Completed = C::Completed; + #[inline] + fn synchronize_on_drop(&self) -> bool { + self.as_ref().map_or(false, Completion::synchronize_on_drop) + } + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { self.map_or(Ok(()), |completion| completion.complete(completed)) @@ -107,9 +125,7 @@ 
impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { - let event = CudaDropWrapper::from(Event::new( - EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, - )?); + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); let (sender, receiver) = oneshot::channel(); @@ -140,9 +156,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn synchronize(mut self) -> CudaResult { - let (receiver, completion) = match self.status { - AsyncStatus::Completed { result } => return result.map(|()| self.value), + pub fn synchronize(self) -> CudaResult { + let (mut value, status) = self.destructure_into_parts(); + + let (receiver, completion) = match status { + AsyncStatus::Completed { result } => return result.map(|()| value), AsyncStatus::Processing { receiver, completion, @@ -157,9 +175,9 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - completion.complete(self.value.borrow_mut())?; + completion.complete(value.borrow_mut())?; - Ok(self.value) + Ok(value) } /// Moves the asynchronous data move to a different [`Stream`]. @@ -168,15 +186,17 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn move_to_stream<'stream_new>( - mut self, + self, stream: &'stream_new Stream, ) -> CudaResult> { - let (receiver, completion, event) = match self.status { + let (mut value, status) = self.destructure_into_parts(); + + let (receiver, completion, event) = match status { AsyncStatus::Completed { .. 
} => { return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, - status: self.status, + value, + status, _capture: PhantomData::<&'a ()>, }) }, @@ -196,7 +216,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, + value, status: AsyncStatus::Processing { receiver, completion, @@ -209,11 +229,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - completion.complete(self.value.borrow_mut())?; + completion.complete(value.borrow_mut())?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, + value, status: AsyncStatus::Completed { result: Ok(()) }, _capture: PhantomData::<&'a ()>, }) @@ -230,15 +250,17 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. 
pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { - match self.status { - AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), + let (value, status) = self.destructure_into_parts(); + + match status { + AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), AsyncStatus::Processing { receiver: _, completion, event: _, _capture, - } => Ok((self.value, Some(completion))), + } => Ok((value, Some(completion))), } } @@ -249,6 +271,34 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { AsyncProj::new(&mut self.value) } + + #[must_use] + fn destructure_into_parts(self) -> (T, AsyncStatus<'a, T, C>) { + let this = std::mem::ManuallyDrop::new(self); + + // Safety: we destructure self into its droppable components, + // value and status, without dropping self itself + unsafe { (std::ptr::read(&this.value), (std::ptr::read(&this.status))) } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { + fn drop(&mut self) { + let AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(self.value.borrow_mut()); + } + } } #[cfg(feature = "host")] @@ -311,8 +361,9 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match self.status - { + let (value, status) = self.destructure_into_parts(); + + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { AsyncStatus::Completed { result } => { (None, AsyncStatus::Completed:: { result }) }, @@ 
-334,13 +385,42 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture AsyncFuture { _stream: PhantomData::<&'stream Stream>, - value: Some(self.value), + value: Some(value), completion, status, } } } +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop + for AsyncFuture<'a, 'stream, T, C> +{ + fn drop(&mut self) { + let Some(mut value) = self.value.take() else { + return; + }; + + let AsyncStatus::Processing { + receiver, + completion: NoCompletion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + let Some(completion) = self.completion.take() else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(value.borrow_mut()); + } + } +} + #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] #[derive(Copy, Clone)] From 356b7b2f5085e55be706304e4b80f3fb4531cf89 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 5 Jan 2024 04:30:31 +0000 Subject: [PATCH 087/120] First sketch of the safety constraints of a new NoSafeAliasing trait --- src/safety/aliasing.rs | 83 ++++++++++++++++++++++++++++++++++++++++++ src/safety/mod.rs | 2 + 2 files changed, 85 insertions(+) create mode 100644 src/safety/aliasing.rs diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs new file mode 100644 index 000000000..25cb61992 --- /dev/null +++ b/src/safety/aliasing.rs @@ -0,0 +1,83 @@ +#[allow(clippy::module_name_repetitions)] +/// Types for which mutable references can be safely shared with each CUDA +/// thread without breaking Rust's no-mutable-aliasing memory safety +/// guarantees. +/// +/// # Safety +/// +/// A type may only implement [`NoSafeAliasing`], if and only if all of the +/// conditions below hold: +/// +/// * Calling [`std::mem::replace`] on a mutable reference of the type does +/// *not* return a value which owns memory which it must deallocate on drop. 
+/// For instance, `&mut [T]` satisfies this criteria, but `Box` does not. +/// +/// * No safe alising mutable access is provided to the same memory locations +/// across multiple CUDA threads. You can use the +/// [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) +/// and +/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) +/// wrapper types to ensure that each thread is only given access to to its +/// own sub-slice partition so that aliasing is avoided. +/// +/// * A mutable reference of the type must not provide mutable access to some +/// shallow inner state (in contrast to deep, which refers to values behind +/// references) of the value which the API user expects to be mutably shared +/// between all threads even if it is not in practice so as to not violate the +/// second condition. For instance, a struct `Counter { pub a: u32 }` violates +/// this third condition, as code with access to `&mut Counter` also gets +/// mutable access to its field `a` and might assume that mutations of this +/// field are either shared across threads or shared back with the host after +/// the kernel has completed, neither of which is possible. In contrast, `&mut +/// [T]` satisfies this condition, as it is well known that modifying the +/// shallow length of a slice (by assigning a sub-slice) inside a function +/// does not alter the length of the slice that the caller of the function +/// passed in. 
+pub unsafe trait NoSafeAliasing {} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const STRIDE: usize, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> +{ +} +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> +{ +} + +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + const STRIDE: usize, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + STRIDE, + > +{ +} +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + > +{ +} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 243a2a9f9..a3741ea90 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,3 +1,4 @@ +mod aliasing; mod arch; mod portable; mod stack_only; @@ -7,5 +8,6 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; +pub use aliasing::NoSafeAliasing; pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; From 564ab2beeaece8e386506a483f4e948d4d16e584 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 Jan 2024 20:33:25 +0000 Subject: [PATCH 088/120] First steps towards reintroducing LendToCudaMut --- examples/single-source/src/main.rs | 4 +- src/kernel/param.rs | 196 +++++++++++++++++++++++------ src/lend/mod.rs | 177 ++++++++++++++++++++++++-- src/safety/aliasing.rs | 38 +++--- src/safety/mod.rs | 
2 +- src/utils/aliasing/const.rs | 2 +- src/utils/aliasing/dynamic.rs | 2 +- src/utils/async.rs | 43 ++++++- src/utils/exchange/wrapper.rs | 41 ++---- 9 files changed, 409 insertions(+), 96 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 4783deffa..89bbdf990 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -64,9 +64,9 @@ pub fn kernel< + rc::safety::StackOnly, >( _x: &rc::kernel::param::PerThreadShallowCopy, - _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _z: &rc::kernel::param::DeepPerThreadBorrow>, _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable, - _: rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _: rc::kernel::param::DeepPerThreadBorrow>, q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy, shared3: &mut rc::utils::shared::ThreadBlockShared, dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 8ca41ddf4..f28944b81 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -14,8 +14,8 @@ use crate::{ alloc::EmptyCudaAlloc, kernel::{sealed, CudaKernelParameter}, lend::RustToCuda, - safety::PortableBitSemantics, - utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef}, + safety::{PortableBitSemantics, SafeMutableAliasing}, + utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, }; pub struct PtxJit { @@ -424,12 +424,12 @@ impl s { } -pub struct SharedHeapPerThreadShallowCopy { +pub struct DeepPerThreadBorrow { never: !, _marker: PhantomData, } -impl Deref for SharedHeapPerThreadShallowCopy { +impl Deref for DeepPerThreadBorrow { type Target = T; fn deref(&self) -> &Self::Target { @@ -444,7 +444,7 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > CudaKernelParameter for SharedHeapPerThreadShallowCopy + > CudaKernelParameter for DeepPerThreadBorrow { #[cfg(feature = "host")] 
type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< @@ -514,13 +514,11 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > sealed::Sealed for SharedHeapPerThreadShallowCopy + > sealed::Sealed for DeepPerThreadBorrow { } -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, @@ -580,7 +578,78 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b mut crate::host::HostAndDeviceMutRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut T; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: 
&Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut DeepPerThreadBorrow +{ +} impl< T: Send @@ -589,18 +658,17 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > CudaKernelParameter for PtxJit> + > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; + as CudaKernelParameter>::FfiType<'stream, 'b>; #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; + type SyncHostType = as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( @@ -608,9 +676,7 @@ impl< stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - as 
CudaKernelParameter>::with_new_async( - param, stream, inner, - ) + as CudaKernelParameter>::with_new_async(param, stream, inner) } #[cfg(feature = "host")] @@ -628,7 +694,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param, token) + as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "host")] @@ -646,7 +712,7 @@ impl< ) -> O { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - as CudaKernelParameter>::with_ffi_as_device::( + as CudaKernelParameter>::with_ffi_as_device::( param, inner, ) } @@ -658,24 +724,84 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > sealed::Sealed for PtxJit> + > sealed::Sealed for PtxJit> { } impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + let param = unsafe { param.unwrap_unchecked() }; + 
inner(Some(&param_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; #[cfg(feature = "host")] - type SyncHostType = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>(
stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( param, stream, inner, ) } @@ -694,7 +820,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_unchecked() }; + let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(&param_as_raw_bytes(param.for_host()))) } @@ -711,7 +837,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, ) -> Result, E> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -721,13 +847,13 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter ) -> O { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( param, inner, ) } } -impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed - for &'a PtxJit> +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut PtxJit> { } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 7d8a1e864..a78cd4018 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -7,16 +7,16 @@ use rustacuda::error::CudaError; pub use rust_cuda_derive::LendRustToCuda; #[cfg(any(feature = "host", feature = "device", doc))] -use crate::safety::StackOnly; +use crate::safety::{SafeMutableAliasing, StackOnly}; #[cfg(feature = "device")] -use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +use crate::utils::ffi::{DeviceConstRef, 
DeviceMutRef, DeviceOwnedRef}; use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; #[cfg(any(feature = "host", feature = "device"))] use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; #[cfg(feature = "host")] use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, - host::{HostAndDeviceConstRef, HostAndDeviceOwned}, + host::{HostAndDeviceConstRef, HostAndDeviceMutRef, HostAndDeviceOwned}, utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; @@ -162,7 +162,7 @@ impl> RustToCudaAsyncProxy for P { #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] pub trait LendToCuda: RustToCuda { - /// Lends an immutable copy of `&self` to CUDA: + /// Lends an immutable borrow of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure /// - after the closure, `&self` will not have changed @@ -183,7 +183,30 @@ pub trait LendToCuda: RustToCuda { where Self: Sync; - /// Moves `self` to CUDA iff `self` is [`StackOnly`]. + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `Self` is [`StackOnly`]. 
/// /// # Errors /// @@ -227,6 +250,30 @@ impl LendToCuda for T { result } + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing, + { + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); + + core::mem::drop(cuda_repr); + + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; + + result + } + fn move_to_cuda< O, E: From, @@ -287,6 +334,45 @@ pub trait LendToCudaAsync: RustToCudaAsync { where Self: Sync; + #[allow(clippy::type_complexity)] + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + T: 'a, + >( + this: owning_ref::BoxRefMut<'a, T, Self>, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, T, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing; + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with @@ -360,6 +446,55 @@ impl LendToCudaAsync for T { result } + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + S: 'a, + >( + this: owning_ref::BoxRefMut<'a, S, Self>, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, S, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing, + { + let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; + + let (mut cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(mut_ref, stream, NoCompletion)? + } else { + Async::ready(mut_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + + let (r#async, _): (_, NoCudaAlloc) = unsafe { Self::restore_async(this, alloc, stream) }?; + + result.map(|ok| (r#async, ok)) + } + fn move_to_cuda_async< 'stream, O, @@ -403,13 +538,25 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceConstRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda`]. + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O; + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr_mut` is the + /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda_mut`]. 
+ unsafe fn with_borrow_from_rust_mut O>( + cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: SafeMutableAliasing; + /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the @@ -437,6 +584,22 @@ impl BorrowFromRust for T { inner(&rust_repr) } + #[inline] + unsafe fn with_borrow_from_rust_mut O>( + mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: SafeMutableAliasing, + { + // `rust_repr` must never be dropped as we do NOT own any of the + // heap memory it might reference + let mut rust_repr_mut = + core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); + + inner(&mut rust_repr_mut) + } + #[inline] unsafe fn with_moved_from_rust O>( mut cuda_repr: DeviceOwnedRef::CudaRepresentation>>, diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs index 25cb61992..7add5775c 100644 --- a/src/safety/aliasing.rs +++ b/src/safety/aliasing.rs @@ -5,8 +5,8 @@ /// /// # Safety /// -/// A type may only implement [`NoSafeAliasing`], if and only if all of the -/// conditions below hold: +/// A type may only implement [`SafeMutableAliasing`], if and +/// only if all of the safety conditions below hold: /// /// * Calling [`std::mem::replace`] on a mutable reference of the type does /// *not* return a value which owns memory which it must deallocate on drop. @@ -24,16 +24,18 @@ /// shallow inner state (in contrast to deep, which refers to values behind /// references) of the value which the API user expects to be mutably shared /// between all threads even if it is not in practice so as to not violate the -/// second condition. 
For instance, a struct `Counter { pub a: u32 }` violates -/// this third condition, as code with access to `&mut Counter` also gets -/// mutable access to its field `a` and might assume that mutations of this -/// field are either shared across threads or shared back with the host after -/// the kernel has completed, neither of which is possible. In contrast, `&mut -/// [T]` satisfies this condition, as it is well known that modifying the -/// shallow length of a slice (by assigning a sub-slice) inside a function -/// does not alter the length of the slice that the caller of the function -/// passed in. -pub unsafe trait NoSafeAliasing {} +/// second condition. For instance, `Vec` violates this third condition, as +/// code with access to `&mut Vec` can also mutate the length of the +/// vector, which is shallow state that is expected to be propagated to the +/// caller of a function sharing this vector (it is also related to the deep +/// contents of the vector via a safety invariant) and might thus assume that +/// mutations of this length are either shared across threads or shared back +/// with the host after the kernel has completed, neither of which is +/// possible. In contrast, `&mut [T]` satisfies this condition, as it is well +/// known that modifying the shallow length of a slice (by assigning a +/// sub-slice) inside a function does not alter the length of the slice that +/// the caller of the function passed in. 
+pub unsafe trait SafeMutableAliasing {} unsafe impl< 'a, @@ -41,20 +43,22 @@ unsafe impl< + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, const STRIDE: usize, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> { } + unsafe impl< 'a, T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> { } +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: crate::safety::StackOnly + crate::safety::PortableBitSemantics @@ -62,20 +66,22 @@ unsafe impl< const M2D: bool, const M2H: bool, const STRIDE: usize, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< crate::utils::exchange::buffer::CudaExchangeBuffer, STRIDE, > { } + +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, const M2D: bool, const M2H: bool, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< crate::utils::exchange::buffer::CudaExchangeBuffer, > diff --git a/src/safety/mod.rs b/src/safety/mod.rs index a3741ea90..c26ef3389 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -8,6 +8,6 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; -pub use aliasing::NoSafeAliasing; +pub use aliasing::SafeMutableAliasing; pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 0259c301a..3ca7b0597 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -13,7 +13,7 @@ use crate::{ }; #[repr(transparent)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, 
TypeLayout)] pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 1c502dc8e..2c663e9d6 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -13,7 +13,7 @@ use crate::{ }; #[repr(C)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsDynamicStride { stride: usize, inner: T, diff --git a/src/utils/async.rs b/src/utils/async.rs index 87b91a3e0..e98758d4f 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -240,7 +240,6 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } #[allow(clippy::missing_errors_doc)] // FIXME - #[allow(clippy::type_complexity)] // FIXME /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -454,3 +453,45 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { self.value } } + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { + #[must_use] + pub const fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { + #[must_use] + pub fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } +} diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 0f1ff89f8..1f3326c5b 100644 --- a/src/utils/exchange/wrapper.rs 
+++ b/src/utils/exchange/wrapper.rs @@ -10,6 +10,7 @@ use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef}, lend::{RustToCuda, RustToCudaAsync}, + safety::SafeMutableAliasing, utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible, @@ -85,16 +86,10 @@ impl> ExchangeWrapperOnHost { }) } - // TODO: safety constraint? /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably /// via [`ExchangeWrapperOnDevice::as_mut`]. /// - /// To avoid aliasing, each CUDA thread will get access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -118,14 +113,8 @@ impl { #[allow(clippy::needless_lifetimes)] // keep 'stream explicit - // TODO: safety constraint? /// Moves the data asynchronously to the CUDA device. /// - /// To avoid aliasing, each CUDA thread will get access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -174,14 +163,8 @@ impl> DerefMut for ExchangeWrapper } impl> ExchangeWrapperOnDevice { - // TODO: safety constraint? /// Moves the data synchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. 
Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -216,7 +199,10 @@ impl> ExchangeWrapperOnDevice { #[must_use] pub fn as_mut( &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> { + ) -> HostAndDeviceMutRef::CudaRepresentation>> + where + T: SafeMutableAliasing, + { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { HostAndDeviceMutRef::new_unchecked( @@ -231,14 +217,8 @@ impl { #[allow(clippy::needless_lifetimes)] // keep 'stream explicit - // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -295,14 +275,8 @@ impl< T: RustToCudaAsync, > Async<'a, 'stream, ExchangeWrapperOnDevice, NoCompletion> { - // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. 
Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -380,7 +354,10 @@ impl< '_, 'stream, HostAndDeviceMutRef::CudaRepresentation>>, - > { + > + where + T: SafeMutableAliasing, + { let this = unsafe { self.as_mut().unwrap_unchecked() }; AsyncProj::new(unsafe { From eeb4020cc98baad04d0a01f747efd752463541d3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 04:25:37 +0000 Subject: [PATCH 089/120] Fix no-std Box import for LendRustToCuda derive --- rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 +- src/deps.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 674f5e166..40dd3487d 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -170,7 +170,7 @@ pub fn rust_to_cuda_async_trait( #crate_path::utils::r#async::Async::< _, #crate_path::utils::r#async::CompletionFnMut, >::pending( - this, stream, Box::new(|this| { + this, stream, #crate_path::deps::alloc::boxed::Box::new(|this| { #(#r2c_field_async_completion_calls)* Ok(()) }), diff --git a/src/deps.rs b/src/deps.rs index 0000f9250..50fd38f3f 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -1,4 +1,5 @@ -pub(crate) extern crate alloc; +#[doc(hidden)] +pub extern crate alloc; pub extern crate const_type_layout; From 4eaaa92afdc3e43427c58bc4125247dc49b762b7 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 05:04:47 +0000 Subject: [PATCH 090/120] Re-add RustToCuda implementation for Final --- .vscode/settings.json | 1 + Cargo.toml | 3 ++ src/lend/impls/final.rs | 102 +++++++++++++++++++++++++++++++++++++++ src/lend/impls/mod.rs | 2 + src/lend/impls/option.rs | 4 +- src/lend/mod.rs | 12 ++--- 6 files changed, 116 insertions(+), 8 
deletions(-) create mode 100644 src/lend/impls/final.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index ddfa41463..d12ff8221 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,6 +6,7 @@ "rust-analyzer.cargo.allFeatures": false, "rust-analyzer.cargo.features": [ "derive", + "final", "host", "kernel" ], diff --git a/Cargo.toml b/Cargo.toml index eb0e1725f..5aaa324bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] +final = ["dep:final"] host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"] kernel = ["dep:rust-cuda-kernel"] @@ -37,5 +38,7 @@ const-type-layout = { version = "0.2.1", features = ["derive"] } safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } +final = { version = "0.1.1", optional = true } + rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs new file mode 100644 index 000000000..6235a58fe --- /dev/null +++ b/src/lend/impls/final.rs @@ -0,0 +1,102 @@ +use r#final::Final; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(const_type_layout::TypeLayout)] +#[repr(transparent)] +pub struct FinalCudaRepresentation(DeviceAccessible); + +unsafe impl RustToCuda for Final { + type CudaAllocation = T::CudaAllocation; + type CudaRepresentation = FinalCudaRepresentation; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::alloc::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow(alloc)?; + + Ok(( + 
DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: crate::alloc::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync for Final { + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let final_cuda_repr = DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + final_cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? 
+ } else { + crate::utils::r#async::Async::ready(final_cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust for FinalCudaRepresentation { + type RustRepresentation = Final; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + Final::new(CudaAsRust::as_rust(&this.0)) + } +} diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs index 18f546bbd..e0360671c 100644 --- a/src/lend/impls/mod.rs +++ b/src/lend/impls/mod.rs @@ -1,5 +1,7 @@ mod r#box; mod boxed_slice; +#[cfg(feature = "final")] +mod r#final; mod option; mod r#ref; mod ref_mut; diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 197906baf..b1c51b9a5 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -108,7 +108,7 @@ unsafe impl RustToCudaAsync for Option { Some(value) => { let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; let (alloc_front, alloc_tail) = alloc.split(); let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); @@ -118,7 +118,7 @@ unsafe impl RustToCudaAsync for Option { present: true, }); - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(option_cuda_repr, stream, NoCompletion)? } else { Async::ready(option_cuda_repr, stream) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index a78cd4018..b3f83ecff 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -428,10 +428,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(const_ref, stream, NoCompletion)? } else { Async::ready(const_ref, stream) @@ -476,10 +476,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; - let (mut cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (mut cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(mut_ref, stream, NoCompletion)? } else { Async::ready(mut_ref, stream) @@ -517,10 +517,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - if matches!(capture_on_completion, Some(NoCompletion)) { + if matches!(completion, Some(NoCompletion)) { inner(Async::pending(owned_ref, stream, NoCompletion)?) } else { inner(Async::ready(owned_ref, stream)) From fc18c7908f94ebc1e76ba5b722ffe7118b618035 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 05:30:17 +0000 Subject: [PATCH 091/120] Remove redundant RustToCudaAsyncProxy --- rust-cuda-derive/src/rust_to_cuda/field_copy.rs | 6 +++--- src/lend/mod.rs | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index c32ac67ee..05d133156 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -158,7 +158,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_async_declarations.push(quote! 
{ let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -184,7 +184,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }; let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( this.map_mut(|this| < - #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut this.#field_accessor)), alloc_front, stream, @@ -199,7 +199,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> >::complete( #field_completion_ident, < - #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut this.#field_accessor), )?; }); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index b3f83ecff..7a3934aa0 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -155,10 +155,6 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } -pub trait RustToCudaAsyncProxy: RustToCudaAsync + RustToCudaProxy {} - -impl> RustToCudaAsyncProxy for P {} - #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] pub trait LendToCuda: RustToCuda { From abaa2598fc7ecd53a6a4538a55aad6931c910d8a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 20:22:43 +0000 Subject: [PATCH 092/120] More progress on less 'static bounds on kernel params --- src/kernel/mod.rs | 62 +- src/kernel/param.rs | 1562 +++++++++++++++++++++---------------------- 2 files changed, 823 insertions(+), 801 deletions(-) diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 7026efc1a..cc51d64f0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -39,52 +39,76 @@ mod sealed { pub struct Token; } +#[cfg(feature = "host")] +pub trait 
WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; +} + +#[cfg(feature = "host")] +impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { + (self)(param) + } +} + +#[cfg(feature = "device")] +pub trait WithFfiAsDevice { + fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b; +} + +#[cfg(feature = "device")] +impl FnOnce(P::DeviceType<'b>) -> O> WithFfiAsDevice for F { + fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b { + (self)(param) + } +} + pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type SyncHostType; #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; + type AsyncHostType<'stream, 'b> where Self: 'b; #[doc(hidden)] - type FfiType<'stream, 'b>: PortableBitSemantics; + type FfiType<'stream, 'b>: PortableBitSemantics where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; + type DeviceType<'b> where Self: 'b; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; + inner: impl WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param; #[doc(hidden)] #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; + ) -> O where Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] - fn shared_layout_for_async( - param: 
&Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> std::alloc::Layout; + ) -> std::alloc::Layout where Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E>; + ) -> Result, E> where Self: 'b; #[doc(hidden)] #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl WithFfiAsDevice, + ) -> O where Self: 'short; } #[cfg(feature = "host")] @@ -151,7 +175,7 @@ macro_rules! impl_launcher_launch { $inner }; (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; @@ -395,7 +419,7 @@ macro_rules! impl_typed_kernel_launch { $inner }; (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { impl_typed_kernel_launch! 
{ impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; diff --git a/src/kernel/param.rs b/src/kernel/param.rs index f28944b81..edc56f4b7 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -72,36 +72,36 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; + crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; + type DeviceType<'b> = T where Self: 'b; + type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; #[cfg(feature = "host")] type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -109,18 +109,18 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: 
Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { Ok(param) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { let param = param.into_inner(); - inner(param) + inner.with(param) } } impl< @@ -135,8 +135,7 @@ impl< impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -147,38 +146,38 @@ impl< 'b, 'stream, &'b crate::host::HostAndDeviceConstRef<'b, T>, - >; + > where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b; #[cfg(feature = "host")] type SyncHostType = &'a T; #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner(const_ref.as_async(stream).as_ref()) + inner.with(const_ref.as_async(stream).as_ref()) }) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async( 
- _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -186,25 +185,24 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { let param = param.as_ref(); - inner(param) + inner.with(param) } } impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -214,8 +212,7 @@ impl< impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -223,38 +220,40 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b> where Self: 'b; type FfiType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; #[cfg(feature = "host")] type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn 
with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { + inner.with(const_ref.as_async(stream).as_ref()) + }) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -262,26 +261,25 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) + let param = 
param.as_ref(); + + inner.with(param) } } impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -289,573 +287,573 @@ impl< { } -pub struct ShallowInteriorMutable< - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, -> { - never: !, - _marker: PhantomData, -} - -impl< - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > Deref for ShallowInteriorMutable -{ - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - 'a, - T: 'static - + Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > CudaKernelParameter for &'a ShallowInteriorMutable -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; - #[cfg(feature = "host")] - /// The kernel takes a mutable borrow of the interior mutable data to ensure - /// the interior mutability is limited to just this kernel invocation. 
- type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async(stream).as_ref()) - }) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let param = unsafe { param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref(); - - inner(param) - } -} -impl< - 'a, - T: crate::safety::StackOnly - + Sync - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > sealed::Sealed for &'a ShallowInteriorMutable -{ -} - -pub trait InteriorMutableSync: Sync + sealed::Sealed {} - -macro_rules! impl_atomic_interior_mutable { - ($atomic:ident($interior:ty)) => { - impl InteriorMutableSync for core::sync::atomic::$atomic {} - impl sealed::Sealed for core::sync::atomic::$atomic {} - }; - ($($atomic:ident($interior:ty)),*) => { - $(impl_atomic_interior_mutable! { $atomic($interior) })* - } -} - -impl_atomic_interior_mutable! 
{ - AtomicBool(bool), - AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), - AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -} - -impl InteriorMutableSync - for core::cell::SyncUnsafeCell -{ -} -impl sealed::Sealed - for core::cell::SyncUnsafeCell -{ -} - -pub struct DeepPerThreadBorrow { - never: !, - _marker: PhantomData, -} - -impl Deref for DeepPerThreadBorrow { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > CudaKernelParameter for DeepPerThreadBorrow -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< - 'b, - 'stream, - crate::host::HostAndDeviceOwned< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - crate::utils::r#async::NoCompletion, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = - DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async(stream))) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: 
sealed::Token, - ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } - } -} -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > sealed::Sealed for DeepPerThreadBorrow -{ -} - -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b crate::host::HostAndDeviceConstRef< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream).as_ref())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let 
param = unsafe { param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } - } -} -impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} - -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter - for &'a mut DeepPerThreadBorrow -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b mut crate::host::HostAndDeviceMutRef< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut T; - type FfiType<'stream, 'b> = - DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { - // FIXME: express the same with param.as_async(stream).as_mut() - let _ = stream; - inner(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) - }) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let param = unsafe { 
param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } - } -} -impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed - for &'a mut DeepPerThreadBorrow -{ -} - -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > CudaKernelParameter for PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.as_ref().unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - 
#[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > sealed::Sealed for PtxJit> -{ -} - -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, 
E> { - <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} - -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter - for &'a mut PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.as_ref().unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - 
param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, E> { - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed - for &'a mut PtxJit> -{ -} +// pub struct ShallowInteriorMutable< +// T: Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > { +// never: !, +// _marker: PhantomData, +// } + +// impl< +// T: Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > Deref for ShallowInteriorMutable +// { +// type Target = T; + +// fn deref(&self) -> &Self::Target { +// self.never +// } +// } + +// impl< +// 'a, +// T: 'static +// + Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > CudaKernelParameter for &'a ShallowInteriorMutable +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b crate::host::HostAndDeviceConstRef<'b, T>, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b T; +// type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; +// #[cfg(feature = "host")] +// /// The kernel takes a mutable borrow of the interior mutable data to ensure +// /// the interior mutability is limited to just this kernel invocation. 
+// type SyncHostType = &'a mut T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { +// inner.with(const_ref.as_ref().as_async(stream).as_ref()) +// }) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// let param = param.as_ref(); + +// inner(param) +// } +// } +// impl< +// 'a, +// T: crate::safety::StackOnly +// + Sync +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > sealed::Sealed for &'a ShallowInteriorMutable +// { +// } + +// pub trait InteriorMutableSync: Sync + sealed::Sealed {} + +// macro_rules! impl_atomic_interior_mutable { +// ($atomic:ident($interior:ty)) => { +// impl InteriorMutableSync for core::sync::atomic::$atomic {} +// impl sealed::Sealed for core::sync::atomic::$atomic {} +// }; +// ($($atomic:ident($interior:ty)),*) => { +// $(impl_atomic_interior_mutable! { $atomic($interior) })* +// } +// } + +// impl_atomic_interior_mutable! 
{ +// AtomicBool(bool), +// AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), +// AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +// } + +// impl InteriorMutableSync +// for core::cell::SyncUnsafeCell +// { +// } +// impl sealed::Sealed +// for core::cell::SyncUnsafeCell +// { +// } + +// pub struct DeepPerThreadBorrow { +// never: !, +// _marker: PhantomData, +// } + +// impl Deref for DeepPerThreadBorrow { +// type Target = T; + +// fn deref(&self) -> &Self::Target { +// self.never +// } +// } + +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > CudaKernelParameter for DeepPerThreadBorrow +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< +// 'b, +// 'stream, +// crate::host::HostAndDeviceOwned< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// crate::utils::r#async::NoCompletion, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = T; +// type FfiType<'stream, 'b> = +// DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout 
{ +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let (param, _completion): (_, Option) = +// unsafe { param.unwrap_unchecked()? }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } +// } +// } +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > sealed::Sealed for DeepPerThreadBorrow +// { +// } + +// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b crate::host::HostAndDeviceConstRef< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b T; +// type FfiType<'stream, 'b> = +// DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = &'a T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn 
shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } +// } +// } +// impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter +// for &'a mut DeepPerThreadBorrow +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b mut crate::host::HostAndDeviceMutRef< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut T; +// type FfiType<'stream, 'b> = +// DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = &'a mut T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { +// // FIXME: express the same with param.as_async(stream).as_mut() +// let _ = stream; +// inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) +// }) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl 
for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } +// } +// } +// impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed +// for &'a mut DeepPerThreadBorrow +// { +// } + +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > CudaKernelParameter for PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type SyncHostType = as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// as CudaKernelParameter>::with_new_async(param, stream, |param: as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: 
&Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.as_ref().unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > sealed::Sealed for PtxJit> +// { +// } + +// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter +// for &'a PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, 
Self, O, E>, +// ) -> Result where Self: 'param { +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, |param: <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter +// for &'a mut PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type 
SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( +// param, stream, |param: <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param), +// ) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.as_ref().unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed +// for &'a mut PtxJit> +// { +// } #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -873,151 +871,151 @@ fn emit_param_ptx_jit_marker(param: &T) { } } -mod private_shared { - use core::marker::PhantomData; - - 
use const_type_layout::{TypeGraphLayout, TypeLayout}; - - use crate::safety::PortableBitSemantics; - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _dummy: [u8; 0], - pub(super) _marker: PhantomData, - } - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { - pub(super) len: usize, - pub(super) _marker: [T; 0], - } -} - -impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - _param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - Ok(private_shared::ThreadBlockSharedFfi { - _dummy: [], - _marker: PhantomData::, - }) - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - _param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let mut param = 
crate::utils::shared::ThreadBlockShared::new_uninit(); - - inner(&mut param) - } -} -impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} - -impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::ThreadBlockSharedSlice -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - param.layout() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - Ok(private_shared::ThreadBlockSharedSliceFfi { - len: param.len(), - _marker: [], - }) - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { - crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) - } - } -} -impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed - for &'a mut 
crate::utils::shared::ThreadBlockSharedSlice -{ -} +// mod private_shared { +// use core::marker::PhantomData; + +// use const_type_layout::{TypeGraphLayout, TypeLayout}; + +// use crate::safety::PortableBitSemantics; + +// #[doc(hidden)] +// #[derive(TypeLayout)] +// #[repr(C)] +// pub struct ThreadBlockSharedFfi { +// pub(super) _dummy: [u8; 0], +// pub(super) _marker: PhantomData, +// } + +// #[doc(hidden)] +// #[derive(TypeLayout)] +// #[repr(C)] +// pub struct ThreadBlockSharedSliceFfi { +// pub(super) len: usize, +// pub(super) _marker: [T; 0], +// } +// } + +// impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; +// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; +// #[cfg(feature = "host")] +// type SyncHostType = Self; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// _stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// inner.with(param) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// _param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// Ok(private_shared::ThreadBlockSharedFfi { +// _dummy: [], +// _marker: PhantomData::, +// }) +// } + 
+// #[cfg(feature = "device")] +// #[allow(clippy::inline_always)] +// #[inline(always)] +// unsafe fn with_ffi_as_device( +// _param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + +// inner(&mut param) +// } +// } +// impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} + +// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter +// for &'a mut crate::utils::shared::ThreadBlockSharedSlice +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; +// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; +// #[cfg(feature = "host")] +// type SyncHostType = Self; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// _stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// inner.with(param) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// param.layout() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// Ok(private_shared::ThreadBlockSharedSliceFfi { +// len: param.len(), +// _marker: [], +// }) +// } + +// #[cfg(feature = "device")] +// 
#[allow(clippy::inline_always)] +// #[inline(always)] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { +// crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) +// } +// } +// } +// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed +// for &'a mut crate::utils::shared::ThreadBlockSharedSlice +// { +// } From e0d2319c54bf79ccb5e6a1bf3a1390d4b5dcebf7 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 22:01:01 +0000 Subject: [PATCH 093/120] Further investigation of less 'static bounds --- src/kernel/mod.rs | 5 +- src/kernel/param.rs | 1433 ++++++++++++++++++++++--------------------- 2 files changed, 721 insertions(+), 717 deletions(-) diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index cc51d64f0..c03ca5517 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -39,12 +39,13 @@ mod sealed { pub struct Token; } -#[cfg(feature = "host")] +#[cfg(feature = "host")] // FIXME: make private? pub trait WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { + #[allow(clippy::missing_errors_doc)] // FIXME fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; } -#[cfg(feature = "host")] +#[cfg(feature = "host")] // FIXME: make private? 
impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { (self)(param) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index edc56f4b7..c40f68e1e 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -234,6 +234,7 @@ impl< stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'param { + // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) }) @@ -272,6 +273,7 @@ impl< ) -> O where Self: 'short { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + // FIXME: forward impl let param = param.as_ref(); inner.with(param) @@ -287,573 +289,574 @@ impl< { } -// pub struct ShallowInteriorMutable< -// T: Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > { -// never: !, -// _marker: PhantomData, -// } - -// impl< -// T: Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > Deref for ShallowInteriorMutable -// { -// type Target = T; - -// fn deref(&self) -> &Self::Target { -// self.never -// } -// } - -// impl< -// 'a, -// T: 'static -// + Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > CudaKernelParameter for &'a ShallowInteriorMutable -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b crate::host::HostAndDeviceConstRef<'b, T>, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b T; -// type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; -// #[cfg(feature = "host")] -// /// The 
kernel takes a mutable borrow of the interior mutable data to ensure -// /// the interior mutability is limited to just this kernel invocation. -// type SyncHostType = &'a mut T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { -// inner.with(const_ref.as_ref().as_async(stream).as_ref()) -// }) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// let param = param.as_ref(); - -// inner(param) -// } -// } -// impl< -// 'a, -// T: crate::safety::StackOnly -// + Sync -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > sealed::Sealed for &'a ShallowInteriorMutable -// { -// } - -// pub trait InteriorMutableSync: Sync + sealed::Sealed {} - -// macro_rules! 
impl_atomic_interior_mutable { -// ($atomic:ident($interior:ty)) => { -// impl InteriorMutableSync for core::sync::atomic::$atomic {} -// impl sealed::Sealed for core::sync::atomic::$atomic {} -// }; -// ($($atomic:ident($interior:ty)),*) => { -// $(impl_atomic_interior_mutable! { $atomic($interior) })* -// } -// } - -// impl_atomic_interior_mutable! { -// AtomicBool(bool), -// AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), -// AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -// } - -// impl InteriorMutableSync -// for core::cell::SyncUnsafeCell -// { -// } -// impl sealed::Sealed -// for core::cell::SyncUnsafeCell -// { -// } - -// pub struct DeepPerThreadBorrow { -// never: !, -// _marker: PhantomData, -// } - -// impl Deref for DeepPerThreadBorrow { -// type Target = T; - -// fn deref(&self) -> &Self::Target { -// self.never -// } -// } - -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > CudaKernelParameter for DeepPerThreadBorrow -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< -// 'b, -// 'stream, -// crate::host::HostAndDeviceOwned< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// crate::utils::r#async::NoCompletion, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = T; -// type FfiType<'stream, 'b> = -// DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) -// } - -// #[cfg(feature = 
"host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let (param, _completion): (_, Option) = -// unsafe { param.unwrap_unchecked()? }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } -// } -// } -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > sealed::Sealed for DeepPerThreadBorrow -// { -// } - -// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b crate::host::HostAndDeviceConstRef< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b T; -// type FfiType<'stream, 'b> = -// DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = &'a T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// 
crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } -// } -// } -// impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} - -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter -// for &'a mut DeepPerThreadBorrow -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b mut crate::host::HostAndDeviceMutRef< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut T; -// type FfiType<'stream, 'b> = -// DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = &'a mut T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// 
crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { -// // FIXME: express the same with param.as_async(stream).as_mut() -// let _ = stream; -// inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) -// }) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } -// } -// } -// impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed -// for &'a mut DeepPerThreadBorrow -// { -// } - -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > CudaKernelParameter for PtxJit> -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = -// as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// 
param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// as CudaKernelParameter>::with_new_async(param, stream, |param: as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.as_ref().unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// as CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > sealed::Sealed for PtxJit> -// { -// } - -// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter -// for &'a PtxJit> -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// <&'a 
DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, |param: <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} - -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter -// for &'a mut PtxJit> -// { -// #[cfg(feature = "host")] -// type 
AsyncHostType<'stream, 'b> = -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( -// param, stream, |param: <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param), -// ) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.as_ref().unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// <&'a mut DeepPerThreadBorrow as 
CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed -// for &'a mut PtxJit> -// { -// } +pub struct ShallowInteriorMutable< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, +> { + never: !, + _marker: PhantomData, +} + +impl< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > Deref for ShallowInteriorMutable +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + 'a, + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > CudaKernelParameter for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b crate::host::HostAndDeviceConstRef<'b, T> + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { + inner.with(const_ref.as_ref().as_async(stream).as_ref()) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + let param = param.as_ref(); + + inner.with(param) + } +} +impl< + 'a, + T: crate::safety::StackOnly + + Sync + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > sealed::Sealed for &'a ShallowInteriorMutable +{ +} + +pub trait InteriorMutableSync: Sync + sealed::Sealed {} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSync for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! { $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! 
{ + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +impl InteriorMutableSync + for core::cell::SyncUnsafeCell +{ +} +impl sealed::Sealed + for core::cell::SyncUnsafeCell +{ +} + +pub struct DeepPerThreadBorrow { + never: !, + _marker: PhantomData, +} + +impl Deref for DeepPerThreadBorrow { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceOwned< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature 
= "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + } +} +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for DeepPerThreadBorrow +{ +} + +impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b crate::host::HostAndDeviceConstRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: 
&Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b mut crate::host::HostAndDeviceMutRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut DeepPerThreadBorrow +{ +} + +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, 
O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.as_ref().unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + } +} +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: Sync + RustToCuda> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream 
rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + <&'a mut DeepPerThreadBorrow as 
CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.as_ref().unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut PtxJit> +{ +} #[cfg(feature = "host")] fn 
param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -871,151 +874,151 @@ fn emit_param_ptx_jit_marker(param: &T) { } } -// mod private_shared { -// use core::marker::PhantomData; - -// use const_type_layout::{TypeGraphLayout, TypeLayout}; - -// use crate::safety::PortableBitSemantics; - -// #[doc(hidden)] -// #[derive(TypeLayout)] -// #[repr(C)] -// pub struct ThreadBlockSharedFfi { -// pub(super) _dummy: [u8; 0], -// pub(super) _marker: PhantomData, -// } - -// #[doc(hidden)] -// #[derive(TypeLayout)] -// #[repr(C)] -// pub struct ThreadBlockSharedSliceFfi { -// pub(super) len: usize, -// pub(super) _marker: [T; 0], -// } -// } - -// impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; -// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; -// #[cfg(feature = "host")] -// type SyncHostType = Self; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// _stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// inner.with(param) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// _param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// 
Ok(private_shared::ThreadBlockSharedFfi { -// _dummy: [], -// _marker: PhantomData::, -// }) -// } - -// #[cfg(feature = "device")] -// #[allow(clippy::inline_always)] -// #[inline(always)] -// unsafe fn with_ffi_as_device( -// _param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); - -// inner(&mut param) -// } -// } -// impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} - -// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter -// for &'a mut crate::utils::shared::ThreadBlockSharedSlice -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; -// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; -// #[cfg(feature = "host")] -// type SyncHostType = Self; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// _stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// inner.with(param) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// param.layout() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// Ok(private_shared::ThreadBlockSharedSliceFfi { -// len: 
param.len(), -// _marker: [], -// }) -// } - -// #[cfg(feature = "device")] -// #[allow(clippy::inline_always)] -// #[inline(always)] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { -// crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) -// } -// } -// } -// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed -// for &'a mut crate::utils::shared::ThreadBlockSharedSlice -// { -// } +mod private_shared { + use core::marker::PhantomData; + + use const_type_layout::{TypeGraphLayout, TypeLayout}; + + use crate::safety::PortableBitSemantics; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _dummy: [u8; 0], + pub(super) _marker: PhantomData, + } + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } +} + +impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared where Self: 'b; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, 
+ ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + _param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + Ok(private_shared::ThreadBlockSharedFfi { + _dummy: [], + _marker: PhantomData::, + }) + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + _param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + + inner.with(&mut param) + } +} +impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} + +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice where Self: 'b; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) 
-> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + param.layout() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + Ok(private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + }) + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| inner.with(param)) + } + } +} +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ +} From fd08c41bae173dfe97ec55598efd4517f918e9f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 10:20:47 +0000 Subject: [PATCH 094/120] Remove 'static bounds from LendToCuda ref kernel params --- .../wrapper/generate/cuda_generic_function.rs | 57 ++- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- src/kernel/mod.rs | 71 ++- src/kernel/param.rs | 404 +++++++++++++----- 4 files changed, 397 insertions(+), 139 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 1b05df23b..4084db0ed 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -16,17 +16,60 @@ pub(in super::super) fn quote_cuda_generic_function( func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { + let mut generic_params = 
(*generic_params).clone(); + let kernel_func_inputs = func_inputs .iter() + .enumerate() .map( - |syn::PatType { - attrs, - ty, - pat, - colon_token, - }| { + |( + i, + syn::PatType { + attrs, + ty, + pat, + colon_token, + }, + )| { + let (ty, lt) = if let syn::Type::Reference(syn::TypeReference { + and_token, + lifetime, + mutability, + elem, + }) = &**ty + { + let lifetime = if let Some(lifetime) = lifetime { + lifetime.clone() + } else { + let lifetime = + syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); + generic_params.insert( + 0, + syn::GenericParam::Lifetime(syn::LifetimeDef { + attrs: Vec::new(), + colon_token: None, + lifetime: lifetime.clone(), + bounds: syn::punctuated::Punctuated::new(), + }), + ); + lifetime + }; + let lt = quote!(#lifetime); + ( + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: Some(lifetime), + mutability: *mutability, + elem: elem.clone(), + }), + lt, + ) + } else { + (syn::Type::clone(ty), quote!('_)) + }; + let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<'_> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<#lt> }; syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index f61bb9b32..48049c5a1 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -52,7 +52,9 @@ pub(in super::super) fn quote_cuda_wrapper( < #specialised_ty as #crate_path::kernel::CudaKernelParameter >::with_ffi_as_device::<_, #i>( - #pat, |#pat| { #inner } + #pat, |#pat: < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::DeviceType::<'_>| { #inner } ) } } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index c03ca5517..a27ed5b71 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -40,26 +40,51 @@ mod sealed { } #[cfg(feature = "host")] // FIXME: make private? -pub trait WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { +pub trait WithNewAsync< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, +> +{ #[allow(clippy::missing_errors_doc)] // FIXME - fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b; } #[cfg(feature = "host")] // FIXME: make private? 
-impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { - fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { +impl< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, + F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result, + > WithNewAsync<'stream, P, O, E> for F +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b, + { (self)(param) } } #[cfg(feature = "device")] pub trait WithFfiAsDevice { - fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b; + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b; } #[cfg(feature = "device")] -impl FnOnce(P::DeviceType<'b>) -> O> WithFfiAsDevice for F { - fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b { +impl FnOnce(P::DeviceType<'b>) -> O> + WithFfiAsDevice for F +{ + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b, + { (self)(param) } } @@ -68,11 +93,17 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type SyncHostType; #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> where Self: 'b; + type AsyncHostType<'stream, 'b> + where + Self: 'b; #[doc(hidden)] - type FfiType<'stream, 'b>: PortableBitSemantics where Self: 'b; + type FfiType<'stream, 'b>: PortableBitSemantics + where + Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> where Self: 'b; + type DeviceType<'b> + where + Self: 'b; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME @@ -80,7 +111,9 @@ pub trait CudaKernelParameter: sealed::Sealed { param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param; + ) -> Result + where + Self: 'param; #[doc(hidden)] #[cfg(feature = "host")] @@ -88,28 +121,36 @@ pub trait CudaKernelParameter: sealed::Sealed { param: 
&Self::AsyncHostType<'stream, 'b>, token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b; + ) -> O + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn shared_layout_for_async<'stream, 'b>( param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> std::alloc::Layout where Self: 'b; + ) -> std::alloc::Layout + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b; + ) -> Result, E> + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "device")] unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl WithFfiAsDevice, - ) -> O where Self: 'short; + ) -> O + where + Self: 'short; } #[cfg(feature = "host")] diff --git a/src/kernel/param.rs b/src/kernel/param.rs index c40f68e1e..1f149d8b4 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -80,11 +80,14 @@ impl< type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } @@ -93,7 +96,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -101,7 +107,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -109,7 +118,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: 
Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(param) } @@ -117,7 +129,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.into_inner(); inner.with(param) @@ -135,10 +150,7 @@ impl< impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] @@ -154,11 +166,14 @@ impl< type SyncHostType = &'a T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) }) @@ -169,7 +184,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -177,7 +195,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -185,7 +206,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -194,7 +218,10 @@ impl< unsafe fn 
with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.as_ref(); inner.with(param) @@ -202,20 +229,14 @@ impl< } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > sealed::Sealed for &'a PerThreadShallowCopy { } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -229,11 +250,14 @@ impl< type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) @@ -245,7 +269,10 @@ impl< param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -254,7 +281,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -262,7 +292,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 
'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } @@ -270,7 +303,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl @@ -281,10 +317,7 @@ impl< } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > sealed::Sealed for &'a PtxJit> { } @@ -339,11 +372,14 @@ impl< type SyncHostType = &'a mut T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { inner.with(const_ref.as_ref().as_async(stream).as_ref()) }) @@ -354,7 +390,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -362,7 +401,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -370,7 +412,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ 
-379,7 +424,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.as_ref(); inner.with(param) @@ -439,10 +487,7 @@ impl Deref for DeepPerThreadBorrow { impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > CudaKernelParameter for DeepPerThreadBorrow { #[cfg(feature = "host")] @@ -463,11 +508,14 @@ impl< type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) } @@ -476,7 +524,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -484,7 +535,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -492,7 +546,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; Ok(param.for_device()) @@ -502,17 +559,19 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) + } } } impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > sealed::Sealed for DeepPerThreadBorrow { } @@ -535,12 +594,17 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + ) -> Result + where + Self: 'b, + { + crate::lend::LendToCuda::lend_to_cuda(param, |param| { + inner.with(param.as_async(stream).as_ref()) + }) } #[cfg(feature = "host")] @@ -548,7 +612,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -556,7 +623,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -564,7 +634,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = 
unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -573,8 +646,13 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} @@ -599,11 +677,14 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter type SyncHostType = &'a mut T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { // FIXME: express the same with param.as_async(stream).as_mut() let _ = stream; @@ -616,7 +697,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -624,7 +708,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -632,7 +719,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> 
Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -641,8 +731,13 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed @@ -653,10 +748,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -670,11 +762,14 @@ impl< type SyncHostType = as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) } @@ -684,7 +779,10 @@ impl< param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -693,7 +791,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> 
+ where + Self: 'b, + { as CudaKernelParameter>::async_to_ffi(param, token) } @@ -701,7 +802,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -709,27 +813,27 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) + } } } impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > sealed::Sealed for PtxJit> { } -impl<'a, T: Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> -{ +impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; @@ -741,13 +845,18 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| { + inner.with(param.as_async(stream).as_ref()) + }) } 
#[cfg(feature = "host")] @@ -755,7 +864,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -764,7 +876,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -772,7 +887,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } @@ -780,11 +898,16 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} @@ -803,11 +926,14 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl 
super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { // FIXME: express the same with param.as_async(stream).as_mut() @@ -821,7 +947,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -830,7 +959,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -838,7 +970,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } @@ -846,11 +981,16 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed @@ -908,11 +1048,14 @@ impl<'a, T: 'static> 
CudaKernelParameter for &'a mut crate::utils::shared::Threa type SyncHostType = Self; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(param) } @@ -921,7 +1064,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -929,7 +1075,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -937,7 +1086,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], _marker: PhantomData::, @@ -950,7 +1102,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( _param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); inner.with(&mut param) @@ -970,11 +1125,14 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete type SyncHostType = Self; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + 
fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(param) } @@ -983,7 +1141,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -991,7 +1152,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn shared_layout_for_async<'stream, 'b>( param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { param.layout() } @@ -999,7 +1163,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), _marker: [], @@ -1012,9 +1179,14 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { unsafe { - crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| inner.with(param)) + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| { + inner.with(param) + }) } } } From 61e83a65036945b70c4ebef4ac2f59994b9e5f4b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 11:36:27 +0000 Subject: [PATCH 095/120] Make CudaExchangeBuffer Sync --- src/utils/exchange/buffer/host.rs | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index ce0cb9d41..7fc8b45bf 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,5 @@ use std::{ - cell::UnsafeCell, + cell::SyncUnsafeCell, ops::{Deref, DerefMut}, }; @@ -31,7 +31,7 @@ pub struct CudaExchangeBufferHost< host_buffer: CudaDropWrapper< LockedBuffer>>, >, - device_buffer: UnsafeCell< + device_buffer: SyncUnsafeCell< CudaDropWrapper< DeviceBuffer>>, >, @@ -55,7 +55,7 @@ impl< DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); - let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( + let device_buffer = SyncUnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -89,7 +89,7 @@ impl>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an UnsafeCell + // Safety: device_buffer is inside an SyncUnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); @@ -183,7 +183,7 @@ impl>>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an UnsafeCell + // Safety: device_buffer is inside an SyncUnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); From 8dc0c6df52348fd119230ca8f1a4edc9562a1f86 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 11:44:58 +0000 Subject: [PATCH 096/120] Make CudaExchangeBuffer Sync v2 --- src/utils/exchange/buffer/host.rs | 12 ++++++------ src/utils/exchange/buffer/mod.rs | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 7fc8b45bf..ce0cb9d41 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,5 @@ use std::{ - cell::SyncUnsafeCell, + cell::UnsafeCell, ops::{Deref, 
DerefMut}, }; @@ -31,7 +31,7 @@ pub struct CudaExchangeBufferHost< host_buffer: CudaDropWrapper< LockedBuffer>>, >, - device_buffer: SyncUnsafeCell< + device_buffer: UnsafeCell< CudaDropWrapper< DeviceBuffer>>, >, @@ -55,7 +55,7 @@ impl< DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); - let device_buffer = SyncUnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( + let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -89,7 +89,7 @@ impl>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an SyncUnsafeCell + // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); @@ -183,7 +183,7 @@ impl>>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an SyncUnsafeCell + // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c48a715ac..ea5118236 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -46,6 +46,14 @@ pub struct CudaExchangeBuffer< inner: device::CudaExchangeBufferDevice, } +unsafe impl< + T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync, + const M2D: bool, + const M2H: bool, + > Sync for CudaExchangeBuffer +{ +} + #[cfg(feature = "host")] impl< T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, From dd9507d96ed34bf03a7537d62a693266ea4a8cb5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 Jan 2024 03:31:42 +0000 Subject: [PATCH 097/120] Add AsyncProj proj_ref and proj_mut convenience methods --- src/kernel/param.rs | 7 +++---- src/utils/async.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 
1f149d8b4..6be634b24 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -71,8 +71,7 @@ impl< > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; + type AsyncHostType<'stream, 'b> = T where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T where Self: 'b; type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; @@ -88,7 +87,7 @@ impl< where Self: 'b, { - inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) + inner.with(param) } #[cfg(feature = "host")] @@ -122,7 +121,7 @@ impl< where Self: 'b, { - Ok(param) + Ok(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "device")] diff --git a/src/utils/async.rs b/src/utils/async.rs index e98758d4f..b008ac553 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -454,6 +454,33 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { } } +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub const fn proj_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: &self.value, + } + } + + #[must_use] + pub fn proj_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: &mut self.value, + } + } +} + #[cfg(feature = "host")] impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { #[must_use] From e2032bf4e1c34bca9dc7214e2394001ff93bdab6 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:42:52 +0000 Subject: [PATCH 098/120] Add RustToCudaWithPortableBitCloneSemantics adapter --- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 4 +- 
src/utils/adapter.rs | 172 ++++++++++++++++++ src/utils/exchange/buffer/mod.rs | 1 + 3 files changed, 175 insertions(+), 2 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 36924aaf9..b2f624d66 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -83,7 +83,7 @@ pub fn swap_field_type_and_filter_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ #[cuda(embed = \"\")] field attribute" ); } @@ -92,7 +92,7 @@ pub fn swap_field_type_and_filter_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ #[cuda(embed = \"\")] field attribute." ); } diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 093a02fd4..c80cab4d0 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -186,6 +186,178 @@ unsafe impl CudaAsRust } } +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCloneSemantics< + T: Clone + PortableBitSemantics + TypeGraphLayout, +>(T); + +impl From + for RustToCudaWithPortableBitCloneSemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl RustToCudaWithPortableBitCloneSemantics { + #[must_use] + pub const fn from_clone(value: &T) -> Self { + Self(value.clone()) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { 
&*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(self.clone()), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( 
+ &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(self.clone()), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCloneSemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + #[allow(clippy::module_name_repetitions)] #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 
ea5118236..28ee028d1 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -46,6 +46,7 @@ pub struct CudaExchangeBuffer< inner: device::CudaExchangeBufferDevice, } +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync, const M2D: bool, From eb6757cda5f0b0f71e509cd3f3e56f6208d62e1d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:45:34 +0000 Subject: [PATCH 099/120] Fix invalid const fn bounds --- src/utils/adapter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index c80cab4d0..1b6b3af4c 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -202,12 +202,12 @@ impl From impl RustToCudaWithPortableBitCloneSemantics { #[must_use] - pub const fn from_clone(value: &T) -> Self { + pub fn from_clone(value: &T) -> Self { Self(value.clone()) } #[must_use] - pub const fn into_inner(self) -> T { + pub fn into_inner(self) -> T { self.0 } From 8552c2163406d0af443130f9d209b477bc34fd1b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:51:23 +0000 Subject: [PATCH 100/120] Add Deref[Mut] to the adapters --- src/utils/adapter.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 1b6b3af4c..f7d041089 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -1,5 +1,7 @@ #![allow(clippy::trait_duplication_in_bounds)] +use core::ops::{Deref, DerefMut}; + use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ @@ -28,6 +30,24 @@ impl From } } +impl Deref + for RustToCudaWithPortableBitCopySemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCopySemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl RustToCudaWithPortableBitCopySemantics { 
#[must_use] pub const fn from_copy(value: &T) -> Self { @@ -200,6 +220,24 @@ impl From } } +impl Deref + for RustToCudaWithPortableBitCloneSemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCloneSemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl RustToCudaWithPortableBitCloneSemantics { #[must_use] pub fn from_clone(value: &T) -> Self { @@ -374,6 +412,20 @@ impl From for DeviceCopyWithPortab } } +impl Deref for DeviceCopyWithPortableBitSemantics { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeviceCopyWithPortableBitSemantics { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl DeviceCopyWithPortableBitSemantics { #[must_use] pub fn into_inner(self) -> T { From 5e1534cf3c4bd98df88aefbfe647dcd9a519dd65 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 05:05:06 +0000 Subject: [PATCH 101/120] Fix pointer type inference error --- src/utils/exchange/buffer/host.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index ce0cb9d41..184de1aca 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -77,9 +77,11 @@ impl>> = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); + let uninit_ptr: *mut DeviceCopyWithPortableBitSemantics> = + uninit.as_mut_ptr(); + for (i, src) in vec.into_iter().enumerate() { - uninit - .as_mut_ptr() + uninit_ptr .add(i) .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem( src, From c74b542d35007dda960831ef1ce014c7ddb70ef8 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 12 Jan 2024 03:29:32 +0000 Subject: [PATCH 102/120] Try removing __rust_cuda_ffi_safe_assert module --- .../kernel/wrapper/generate/cuda_wrapper.rs | 26 +++++++++------- 
.../generate/host_linker_macro/get_ptx.rs | 31 ++++++++++--------- src/host/mod.rs | 2 -- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 48049c5a1..74ab20f5b 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -61,6 +61,15 @@ pub(in super::super) fn quote_cuda_wrapper( }, ); + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + quote! { #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] @@ -68,6 +77,12 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(unused_unsafe)] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #ffi_types; + )* } + unsafe { // Initialise the dynamically-sized thread-block shared memory // and the thread-local offset pointer that points to it @@ -89,17 +104,6 @@ pub(in super::super) fn quote_cuda_wrapper( ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - #[allow(unused_imports)] - use super::*; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #ffi_types; - )* } - } - #ffi_param_ptx_jit_wrap } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 599b68fce..303b43ff1 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -80,10 +80,24 @@ pub(super) fn quote_get_ptx( .collect::>() 
}; + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + quote! { fn get_ptx() -> &'static ::core::ffi::CStr { - #[allow(unused_imports)] - use __rust_cuda_ffi_safe_assert::#args; + #args_trait + + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #cpu_func_lifetime_erased_types; + )* } #crate_path::kernel::link_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token @@ -95,19 +109,6 @@ pub(super) fn quote_get_ptx( #(#type_layout_asserts)* - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - #[allow(unused_imports)] - use super::*; - - #args_trait - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - PTX_CSTR } } diff --git a/src/host/mod.rs b/src/host/mod.rs index 2ddc768dd..ef45511e9 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -152,7 +152,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } - #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> where @@ -164,7 +163,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } - #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { self.host_ref From 139adce5c160f8d9d1f89b661a2ed8f623fb1212 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 06:10:07 +0000 Subject: [PATCH 103/120] Ensure async launch mutable borrow safety with barriers on use and stream move --- examples/print/src/main.rs | 16 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- .../wrapper/generate/cuda_generic_function.rs | 3 + src/host/mod.rs | 50 ++-- src/kernel/mod.rs | 46 +++- src/kernel/param.rs | 49 ++-- src/lend/impls/box.rs | 4 +- src/lend/impls/boxed_slice.rs | 4 
+- src/lend/impls/final.rs | 4 +- src/lend/impls/option.rs | 4 +- src/lend/impls/ref.rs | 4 +- src/lend/impls/slice_ref.rs | 4 +- src/lend/mod.rs | 16 +- src/utils/adapter.rs | 8 +- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/async.rs | 249 ++++++++++++++---- src/utils/exchange/buffer/host.rs | 4 +- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 49 ++-- 20 files changed, 358 insertions(+), 172 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7423f06ac..7cd9ab3f2 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -55,7 +55,7 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { ); // Create a new CUDA stream to submit kernels to - let stream = + let mut stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new( rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING, None, @@ -70,12 +70,14 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { }; // Launch the CUDA kernel on the stream and synchronise to its completion - println!("Launching print kernel ..."); - kernel.launch1(&stream, &config, Action::Print)?; - println!("Launching panic kernel ..."); - kernel.launch1(&stream, &config, Action::Panic)?; - println!("Launching alloc error kernel ..."); - kernel.launch1(&stream, &config, Action::AllocError)?; + rust_cuda::host::Stream::with(&mut stream, |stream| { + println!("Launching print kernel ..."); + kernel.launch1(stream, &config, Action::Print)?; + println!("Launching panic kernel ..."); + kernel.launch1(stream, &config, Action::Panic)?; + println!("Launching alloc error kernel ..."); + kernel.launch1(stream, &config, Action::AllocError) + })?; Ok(()) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 40dd3487d..e45a0e283 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ 
b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -191,7 +191,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::host::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< '_, 'stream, @@ -219,7 +219,7 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::host::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< 'a, 'stream, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 4084db0ed..62cb3456d 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -82,6 +82,9 @@ pub(in super::super) fn quote_cuda_generic_function( ) .collect::>(); + let generic_start_token = generic_start_token.unwrap_or_default(); + let generic_close_token = generic_close_token.unwrap_or_default(); + quote! 
{ #[cfg(target_os = "cuda")] #(#func_attrs)* diff --git a/src/host/mod.rs b/src/host/mod.rs index ef45511e9..25fd73a84 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -11,7 +11,6 @@ use rustacuda::{ event::Event, memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, - stream::Stream, }; use crate::{ @@ -26,6 +25,33 @@ use crate::{ }, }; +#[repr(transparent)] +pub struct Stream { + stream: rustacuda::stream::Stream, +} + +impl Deref for Stream { + type Target = rustacuda::stream::Stream; + + fn deref(&self) -> &Self::Target { + &self.stream + } +} + +impl Stream { + pub fn with( + stream: &mut rustacuda::stream::Stream, + inner: impl for<'stream> FnOnce(&'stream Self) -> O, + ) -> O { + // Safety: + // - Stream is a newtype wrapper around rustacuda::stream::Stream + // - we forge a unique lifetime for a unique reference + let stream = unsafe { &*std::ptr::from_ref(stream).cast() }; + + inner(stream) + } +} + pub trait CudaDroppable: Sized { #[allow(clippy::missing_errors_doc)] fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; @@ -88,7 +114,7 @@ impl CudaDroppable for LockedBuffer { } macro_rules! impl_sealed_drop_value { - ($type:ident) => { + ($type:ty) => { impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) @@ -98,7 +124,7 @@ macro_rules! 
impl_sealed_drop_value { } impl_sealed_drop_value!(Module); -impl_sealed_drop_value!(Stream); +impl_sealed_drop_value!(rustacuda::stream::Stream); impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); @@ -142,7 +168,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new_unchecked( + pub(crate) unsafe fn new_unchecked( device_box: &'a mut DeviceBox>, host_ref: &'a mut T, ) -> Self { @@ -180,7 +206,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> where 'a: 'b, { @@ -191,20 +217,14 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_async<'b, 'stream>( - &'b mut self, + pub fn into_async<'b, 'stream>( + self, stream: &'stream Stream, ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, { - Async::ready( - HostAndDeviceMutRef { - device_box: self.device_box, - host_ref: self.host_ref, - }, - stream, - ) + Async::ready(self.into_mut(), stream) } } @@ -253,7 +273,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new_unchecked( + pub(crate) const unsafe fn new_unchecked( device_box: &'a DeviceBox>, host_ref: &'a T, ) -> Self { diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index a27ed5b71..b5fea0af8 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -11,7 +11,6 @@ use rustacuda::{ error::{CudaError, CudaResult}, function::Function, module::Module, - stream::Stream, }; #[cfg(feature = "kernel")] @@ -27,6 +26,8 @@ mod ptx_jit; #[cfg(feature = "host")] use ptx_jit::{PtxJITCompiler, PtxJITResult}; +#[cfg(feature = "host")] 
+use crate::host::Stream; use crate::safety::PortableBitSemantics; pub mod param; @@ -109,7 +110,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[allow(clippy::missing_errors_doc)] // FIXME fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -206,7 +207,9 @@ macro_rules! impl_launcher_launch { pub fn $launch_async<$($T: CudaKernelParameter),*>( &mut self, $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> + ) -> CudaResult> where Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), { @@ -375,13 +378,10 @@ macro_rules! impl_typed_kernel_launch { config, $($arg,)* |kernel, stream, config, $($arg),*| { - let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + let r#async = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*)?; // important: always synchronise here, this function is sync! - match (stream.synchronize(), result) { - (Ok(()), result) => result, - (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), - } + r#async.synchronize() }, ) } @@ -422,7 +422,29 @@ macro_rules! 
impl_typed_kernel_launch { stream: &'stream Stream, config: &LaunchConfig, $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> + ) -> CudaResult> + // launch_async does not need to capture its parameters until kernel completion: + // - moved parameters are moved and cannot be used again, deallocation will sync + // - immutably borrowed parameters can be shared across multiple kernel launches + // - mutably borrowed parameters are more tricky: + // - Rust's borrowing rules ensure that a single mutable reference cannot be + // passed into multiple parameters of the kernel (no mutable aliasing) + // - CUDA guarantees that kernels launched on the same stream are executed + // sequentially, so even immediate resubmissions for the same mutable data + // will not have temporally overlapping mutation on the same stream + // - however, we have to guarantee that mutable data cannot be used on several + // different streams at the same time + // - Async::move_to_stream always adds a synchronisation barrier between the + // old and the new stream to ensure that all uses on the old stream happen + // strictly before all uses on the new stream + // - async launches take AsyncProj<&mut HostAndDeviceMutRef<..>>, which either + // captures an Async, which must be moved to a different stream explicitly, + // or contains data that cannot async move to a different stream without + // - any use of a mutable borrow in an async kernel launch adds a sync barrier + // on the launch stream s.t. the borrow is only complete once the kernel has + // completed where Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), { @@ -454,7 +476,11 @@ macro_rules! impl_typed_kernel_launch { &mut $T::async_to_ffi($arg, sealed::Token)? 
).cast::()),* ], - ) } + ) }?; + + crate::utils::r#async::Async::pending( + (), stream, crate::utils::r#async::NoCompletion, + ) } }; (impl $func:ident () + ($($other:expr),*) $inner:block) => { diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 6be634b24..a5a3cf457 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -81,7 +81,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -167,7 +167,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -251,7 +251,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -272,7 +272,7 @@ impl< where Self: 'b, { - let param = unsafe { param.unwrap_unchecked() }; + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -373,7 +373,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -509,7 +509,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -595,7 +595,7 @@ impl<'a, T: Sync + RustToCuda> 
CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -678,16 +678,20 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'b, { - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { - // FIXME: express the same with param.as_async(stream).as_mut() + crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| { + // FIXME: express the same with param.into_async(stream).as_mut() let _ = stream; - inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + inner.with({ + // Safety: this projection cannot be moved to a different stream + // without first exiting lend_to_cuda_mut and synchronizing + unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + }) }) } @@ -716,12 +720,13 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, + mut param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> where Self: 'b, { + param.record_mut_use()?; let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -763,7 +768,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -846,7 +851,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, - stream: &'stream 
rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -867,7 +872,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'b, { // FIXME: forward impl - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| { // FIXME: express the same with param.as_async(stream).as_mut() let _ = stream; - inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + inner.with({ + // Safety: this projection cannot be moved to a different stream + // without first exiting lend_to_cuda_mut and synchronizing + unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + }) }) } @@ -1049,7 +1058,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1126,7 +1135,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 121fe3905..fff0bb8d8 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -90,7 +90,7 @@ unsafe impl RustToCudaAsync for Box( &self, alloc: A, - stream: 
&'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -131,7 +131,7 @@ unsafe impl RustToCudaAsync for Box( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 09a612c98..c275a6d1c 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -96,7 +96,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -140,7 +140,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs index 6235a58fe..845424ef9 100644 --- a/src/lend/impls/final.rs +++ b/src/lend/impls/final.rs @@ -49,7 +49,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -76,7 +76,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: 
owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index b1c51b9a5..76be7e762 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -89,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -135,7 +135,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 501393f63..3ce472317 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -85,7 +85,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git 
a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 4f8a3ecd9..07271a75a 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -88,7 +88,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -132,7 +132,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 7a3934aa0..6c0467fd5 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -101,7 +101,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -324,7 +324,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( &self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -357,7 +357,7 @@ pub trait LendToCudaAsync: 
RustToCudaAsync { T: 'a, >( this: owning_ref::BoxRefMut<'a, T, Self>, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result< ( @@ -393,7 +393,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -416,7 +416,7 @@ impl LendToCudaAsync for T { ) -> Result, >( &self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -458,7 +458,7 @@ impl LendToCudaAsync for T { S: 'a, >( this: owning_ref::BoxRefMut<'a, S, Self>, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result< ( @@ -505,7 +505,7 @@ impl LendToCudaAsync for T { ) -> Result, >( self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index f7d041089..84aa28569 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -156,7 +156,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -172,7 +172,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, @@ -346,7 +346,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( 
crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -362,7 +362,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 3ca7b0597..24178131c 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -222,7 +222,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -250,7 +250,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 2c663e9d6..c16d4bf4f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -200,7 +200,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -230,7 +230,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: 
owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/async.rs b/src/utils/async.rs index b008ac553..7a33da8d6 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -3,12 +3,12 @@ use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::Phantom #[cfg(feature = "host")] use rustacuda::{ - error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::Stream, + error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::StreamWaitEventFlags, }; #[cfg(feature = "host")] -use crate::host::CudaDropWrapper; +use crate::host::{CudaDropWrapper, Stream}; #[cfg(feature = "host")] pub struct NoCompletion; @@ -19,6 +19,8 @@ pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a> pub trait Completion>: sealed::Sealed { type Completed: ?Sized; + fn no_op() -> Self; + #[doc(hidden)] fn synchronize_on_drop(&self) -> bool; @@ -34,6 +36,11 @@ mod sealed { impl Completion for NoCompletion { type Completed = T; + #[inline] + fn no_op() -> Self { + Self + } + #[inline] fn synchronize_on_drop(&self) -> bool { false @@ -51,6 +58,11 @@ impl sealed::Sealed for NoCompletion {} impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] + fn no_op() -> Self { + Box::new(|_value| Ok(())) + } + #[inline] fn synchronize_on_drop(&self) -> bool { true @@ -68,6 +80,11 @@ impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} impl, C: Completion> Completion for Option { type Completed = C::Completed; + #[inline] + fn no_op() -> Self { + None + } + #[inline] fn synchronize_on_drop(&self) -> bool { self.as_ref().map_or(false, Completion::synchronize_on_drop) @@ -83,7 +100,7 @@ impl sealed::Sealed for Option {} #[cfg(feature = "host")] pub struct Async<'a, 'stream, T: BorrowMut, C: 
Completion = NoCompletion> { - _stream: PhantomData<&'stream Stream>, + stream: &'stream Stream, value: T, status: AsyncStatus<'a, T, C>, _capture: PhantomData<&'a ()>, @@ -95,7 +112,7 @@ enum AsyncStatus<'a, T: BorrowMut, C: Completion> { Processing { receiver: oneshot::Receiver>, completion: C, - event: CudaDropWrapper, + event: Option>, _capture: PhantomData<&'a T>, }, Completed { @@ -108,10 +125,8 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Wraps a `value` which is ready on `stream`. #[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { - let _ = stream; - Self { - _stream: PhantomData::<&'stream Stream>, + stream, value, status: AsyncStatus::Completed { result: Ok(()) }, _capture: PhantomData::<&'a ()>, @@ -125,20 +140,16 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { - let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); - let (sender, receiver) = oneshot::channel(); - stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; - event.record(stream)?; Ok(Self { - _stream: PhantomData::<&'stream Stream>, + stream, value, status: AsyncStatus::Processing { receiver, completion, - event, + event: None, _capture: PhantomData::<&'a T>, }, _capture: PhantomData::<&'a ()>, @@ -157,7 +168,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. 
pub fn synchronize(self) -> CudaResult { - let (mut value, status) = self.destructure_into_parts(); + let (_stream, mut value, status) = self.destructure_into_parts(); let (receiver, completion) = match status { AsyncStatus::Completed { result } => return result.map(|()| value), @@ -182,6 +193,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Moves the asynchronous data move to a different [`Stream`]. /// + /// This method always adds a synchronisation barrier between the old and + /// the new [`Stream`] to ensure that any usages of this [`Async`]'s + /// computations on the old [`Stream`] have completed before they can be + /// used on the new one. + /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. @@ -189,52 +205,45 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea self, stream: &'stream_new Stream, ) -> CudaResult> { - let (mut value, status) = self.destructure_into_parts(); - - let (receiver, completion, event) = match status { - AsyncStatus::Completed { .. 
} => { - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - value, - status, - _capture: PhantomData::<&'a ()>, - }) + let (old_stream, mut value, status) = self.destructure_into_parts(); + + let completion = match status { + AsyncStatus::Completed { result } => { + result?; + C::no_op() }, AsyncStatus::Processing { receiver, completion, - event, + event: _, _capture, - } => (receiver, completion, event), - }; - - match receiver.try_recv() { - Ok(Ok(())) => (), - Ok(Err(err)) => return Err(err), - Err(oneshot::TryRecvError::Empty) => { - stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; - - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - value, - status: AsyncStatus::Processing { - receiver, - completion, - event, - _capture: PhantomData::<&'a T>, - }, - _capture: PhantomData::<&'a ()>, - }); + } => match receiver.try_recv() { + Ok(Ok(())) => { + completion.complete(value.borrow_mut())?; + C::no_op() + }, + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => completion, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }, - Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - completion.complete(value.borrow_mut())?; + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + event.record(old_stream)?; + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, + stream, value, - status: AsyncStatus::Completed { result: Ok(()) }, + status: AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }, _capture: PhantomData::<&'a ()>, }) } @@ -249,7 +258,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// computation out of smaller ones that have all been 
submitted to the /// same [`Stream`]. pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { - let (value, status) = self.destructure_into_parts(); + let (_stream, value, status) = self.destructure_into_parts(); match status { AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), @@ -264,20 +273,63 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { - AsyncProj::new(&self.value) + // Safety: this projection captures this async + unsafe { AsyncProj::new(&self.value, None) } } pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { - AsyncProj::new(&mut self.value) + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + &mut self.value, + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }; + + Ok(()) + })), + ) + } } #[must_use] - fn destructure_into_parts(self) -> (T, AsyncStatus<'a, T, C>) { + fn destructure_into_parts(self) -> (&'stream Stream, T, AsyncStatus<'a, T, C>) { let this = std::mem::ManuallyDrop::new(self); // Safety: we destructure self into its droppable components, // value and status, without dropping self itself - unsafe { (std::ptr::read(&this.value), (std::ptr::read(&this.status))) } + unsafe { + ( + this.stream, + std::ptr::read(&this.value), + (std::ptr::read(&this.status)), + ) + } } } @@ -360,7 +412,7 @@ impl<'a, 'stream, T: 
BorrowMut, C: Completion> IntoFuture type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (value, status) = self.destructure_into_parts(); + let (_stream, value, status) = self.destructure_into_parts(); let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { AsyncStatus::Completed { result } => { @@ -422,21 +474,30 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -#[derive(Copy, Clone)] pub struct AsyncProj<'a, 'stream, T: 'a> { _capture: PhantomData<&'a ()>, _stream: PhantomData<&'stream Stream>, value: T, + use_callback: Option CudaResult<()> + 'a>>, } #[cfg(feature = "host")] impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { #[must_use] - pub(crate) const fn new(value: T) -> Self { + /// # Safety + /// + /// This projection must either capture an existing [`Async`] or come from + /// a source that ensures that the projected value can never (async) move + /// to a different [`Stream`]. + pub(crate) const unsafe fn new( + value: T, + use_callback: Option CudaResult<()> + 'a>>, + ) -> Self { Self { _capture: PhantomData::<&'a ()>, _stream: PhantomData::<&'stream Stream>, value, + use_callback, } } @@ -452,6 +513,22 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { pub(crate) unsafe fn unwrap_unchecked(self) -> T { self.value } + + #[allow(clippy::type_complexity)] + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_unchecked_with_use( + self, + ) -> (T, Option CudaResult<()> + 'a>>) { + (self.value, self.use_callback) + } } #[cfg(feature = "host")] @@ -465,6 +542,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: &self.value, + use_callback: None, } } @@ -477,8 +555,18 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: &mut self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), } } + + pub(crate) fn record_mut_use(&mut self) -> CudaResult<()> { + self.use_callback + .as_mut() + .map_or(Ok(()), |use_callback| use_callback()) + } } #[cfg(feature = "host")] @@ -492,8 +580,22 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: None, } } + + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) const unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } } #[cfg(feature = "host")] @@ -507,6 +609,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: None, } } @@ -519,6 +622,38 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), } } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + self.value + } } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 184de1aca..7db5ba3a2 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -180,7 +180,7 @@ impl( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>>, CombinedCudaAlloc, @@ -217,7 +217,7 @@ impl( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 28ee028d1..80fa09bbd 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -146,7 +146,7 @@ unsafe impl( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -159,7 +159,7 @@ unsafe impl( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 1f3326c5b..faa9c5b44 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -3,12 +3,11 @@ use std::ops::{Deref, DerefMut}; use rustacuda::{ error::CudaResult, memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, - stream::Stream, }; use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, - host::{CudaDropWrapper, 
HostAndDeviceConstRef, HostAndDeviceMutRef}, + host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef, Stream}, lend::{RustToCuda, RustToCudaAsync}, safety::SafeMutableAliasing, utils::{ @@ -195,22 +194,6 @@ impl> ExchangeWrapperOnDevice { ) } } - - #[must_use] - pub fn as_mut( - &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> - where - T: SafeMutableAliasing, - { - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceMutRef::new_unchecked( - &mut self.device_box, - (**self.locked_cuda_repr).into_mut(), - ) - } - } } impl> @@ -339,12 +322,16 @@ impl< > { let this = unsafe { self.as_ref().unwrap_unchecked() }; - AsyncProj::new(unsafe { - HostAndDeviceConstRef::new_unchecked( - &*(this.device_box), - (**(this.locked_cuda_repr)).into_ref(), + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + HostAndDeviceConstRef::new_unchecked( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ), + None, ) - }) + } } #[must_use] @@ -358,13 +345,17 @@ impl< where T: SafeMutableAliasing, { - let this = unsafe { self.as_mut().unwrap_unchecked() }; + let (this, use_callback) = unsafe { self.as_mut().unwrap_unchecked_with_use() }; - AsyncProj::new(unsafe { - HostAndDeviceMutRef::new_unchecked( - &mut *(this.device_box), - (**(this.locked_cuda_repr)).into_mut(), + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + HostAndDeviceMutRef::new_unchecked( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ), + use_callback, ) - }) + } } } From 36aa41a374637084fe95ed95799875dc622229e4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 07:18:28 +0000 Subject: [PATCH 104/120] Fix uniqueness guarantee for Stream using branded types --- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- src/host/mod.rs | 54 ++++++++++++++++------- src/kernel/mod.rs | 12 ++--- src/kernel/param.rs | 24 +++++----- src/lend/impls/box.rs | 8 
++-- src/lend/impls/boxed_slice.rs | 8 ++-- src/lend/impls/final.rs | 4 +- src/lend/impls/option.rs | 4 +- src/lend/impls/ref.rs | 6 +-- src/lend/impls/slice_ref.rs | 6 +-- src/lend/mod.rs | 16 +++---- src/utils/adapter.rs | 8 ++-- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/async.rs | 32 +++++++------- src/utils/exchange/buffer/host.rs | 8 ++-- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 8 ++-- 18 files changed, 119 insertions(+), 95 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index e45a0e283..e0a67b7e3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -191,7 +191,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &'stream #crate_path::host::Stream, + stream: #crate_path::host::Stream<'stream>, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< '_, 'stream, @@ -219,7 +219,7 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &'stream #crate_path::host::Stream, + stream: #crate_path::host::Stream<'stream>, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< 'a, 'stream, diff --git a/src/host/mod.rs b/src/host/mod.rs index 25fd73a84..23bab2706 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -25,30 +25,54 @@ use crate::{ }, }; +type InvariantLifetime<'brand> = PhantomData &'brand ()>; + +#[derive(Copy, Clone)] #[repr(transparent)] -pub struct Stream { - stream: rustacuda::stream::Stream, +pub struct Stream<'stream> { + stream: &'stream rustacuda::stream::Stream, + _brand: InvariantLifetime<'stream>, } -impl Deref for Stream { +impl<'stream> Deref for Stream<'stream> { type Target = rustacuda::stream::Stream; fn 
deref(&self) -> &Self::Target { - &self.stream + self.stream } } -impl Stream { +impl<'stream> Stream<'stream> { + #[allow(clippy::needless_pass_by_ref_mut)] + /// Create a new uniquely branded [`Stream`], which can bind async + /// operations to the [`Stream`] that they are computed on. + /// + /// The uniqueness guarantees are provided by using branded types, + /// as inspired by the Ghost Cell paper by Yanovski, J., Dang, H.-H., + /// Jung, R., and Dreyer, D.: . + /// + /// # Examples + /// + /// The following example shows that two [`Stream`]'s with different + /// `'stream` lifetime brands cannot be used interchangeably. + /// + /// ```rust, compile_fail + /// use rust_cuda::host::Stream; + /// + /// fn check_same<'stream>(_stream_a: Stream<'stream>, _stream_b: Stream<'stream>) {} + /// + /// fn two_streams<'stream_a, 'stream_b>(stream_a: Stream<'stream_a>, stream_b: Stream<'stream_b>) { + /// check_same(stream_a, stream_b); + /// } + /// ``` pub fn with( stream: &mut rustacuda::stream::Stream, - inner: impl for<'stream> FnOnce(&'stream Self) -> O, + inner: impl for<'new_stream> FnOnce(Stream<'new_stream>) -> O, ) -> O { - // Safety: - // - Stream is a newtype wrapper around rustacuda::stream::Stream - // - we forge a unique lifetime for a unique reference - let stream = unsafe { &*std::ptr::from_ref(stream).cast() }; - - inner(stream) + inner(Stream { + stream, + _brand: InvariantLifetime::default(), + }) } } @@ -219,7 +243,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { #[must_use] pub fn into_async<'b, 'stream>( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, @@ -312,7 +336,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> #[must_use] pub const fn as_async<'b, 'stream>( &'b self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, 
NoCompletion> where 'a: 'b, @@ -370,7 +394,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { #[must_use] pub const fn into_async<'stream>( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'a, 'stream, Self, NoCompletion> { Async::ready(self, stream) } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index b5fea0af8..42e13e0ce 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -110,7 +110,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[allow(clippy::missing_errors_doc)] // FIXME fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -156,7 +156,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] pub struct Launcher<'stream, 'kernel, Kernel> { - pub stream: &'stream Stream, + pub stream: Stream<'stream>, pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } @@ -366,7 +366,7 @@ macro_rules! impl_typed_kernel_launch { #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::SyncHostType),* ) -> CudaResult<()> @@ -396,12 +396,12 @@ macro_rules! impl_typed_kernel_launch { $($T: CudaKernelParameter),* >( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::SyncHostType,)* inner: impl FnOnce( &'kernel mut Self, - &'stream Stream, + Stream<'stream>, &LaunchConfig, $($T::AsyncHostType<'stream, '_>),* ) -> Result, @@ -419,7 +419,7 @@ macro_rules! 
impl_typed_kernel_launch { #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -167,7 +167,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -251,7 +251,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -373,7 +373,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -509,7 +509,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -595,7 +595,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -678,7 +678,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] 
fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -768,7 +768,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -851,7 +851,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -932,7 +932,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1058,7 +1058,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1135,7 +1135,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index fff0bb8d8..b4cec19cd 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -90,7 +90,7 @@ unsafe impl 
RustToCudaAsync for Box( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -113,7 +113,7 @@ unsafe impl RustToCudaAsync for Box>, >::uninitialized()?); - device_box.async_copy_from(&*locked_box, stream)?; + device_box.async_copy_from(&*locked_box, &stream)?; Ok(( Async::pending( @@ -131,7 +131,7 @@ unsafe impl RustToCudaAsync for Box( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -141,7 +141,7 @@ unsafe impl RustToCudaAsync for Box>::pending( this, diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index c275a6d1c..5215d2acf 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -96,7 +96,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -120,7 +120,7 @@ unsafe impl RustToCudaAsync for Box<[ let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< DeviceCopyWithPortableBitSemantics>, >::uninitialized(self.len())?); - device_buffer.async_copy_from(&*locked_buffer, stream)?; + device_buffer.async_copy_from(&*locked_buffer, &stream)?; Ok(( Async::pending( @@ -140,7 +140,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -150,7 +150,7 @@ unsafe 
impl RustToCudaAsync for Box<[ let (alloc_front, alloc_tail) = alloc.split(); let (mut locked_buffer, device_buffer) = alloc_front.split(); - device_buffer.async_copy_to(&mut *locked_buffer, stream)?; + device_buffer.async_copy_to(&mut *locked_buffer, &stream)?; let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending( this, diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs index 845424ef9..5799a77eb 100644 --- a/src/lend/impls/final.rs +++ b/src/lend/impls/final.rs @@ -49,7 +49,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -76,7 +76,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 76be7e762..0e9c3c34d 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -89,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -135,7 +135,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, 
CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 3ce472317..4233d1423 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -85,7 +85,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -108,7 +108,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & let mut device_box = CudaDropWrapper::from(DeviceBox::< DeviceCopyWithPortableBitSemantics>, >::uninitialized()?); - device_box.async_copy_from(&*locked_box, stream)?; + device_box.async_copy_from(&*locked_box, &stream)?; Ok(( Async::pending( @@ -127,7 +127,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 07271a75a..bd74dea64 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -88,7 +88,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -112,7 +112,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< DeviceCopyWithPortableBitSemantics>, 
>::uninitialized(self.len())?); - device_buffer.async_copy_from(&*locked_buffer, stream)?; + device_buffer.async_copy_from(&*locked_buffer, &stream)?; Ok(( Async::pending( @@ -132,7 +132,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 6c0467fd5..e05237768 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -101,7 +101,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -324,7 +324,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( &self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where @@ -357,7 +357,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { T: 'a, >( this: owning_ref::BoxRefMut<'a, T, Self>, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result< ( @@ -393,7 +393,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) 
-> Result where @@ -416,7 +416,7 @@ impl LendToCudaAsync for T { ) -> Result, >( &self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where @@ -458,7 +458,7 @@ impl LendToCudaAsync for T { S: 'a, >( this: owning_ref::BoxRefMut<'a, S, Self>, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result< ( @@ -505,7 +505,7 @@ impl LendToCudaAsync for T { ) -> Result, >( self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 84aa28569..fa023cc66 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -156,7 +156,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -172,7 +172,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, @@ -346,7 +346,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -362,7 +362,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git 
a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 24178131c..097b4c0f4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -222,7 +222,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -250,7 +250,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c16d4bf4f..3928c87d1 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -200,7 +200,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -230,7 +230,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/async.rs b/src/utils/async.rs index 7a33da8d6..24ef8bfc2 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -100,7 +100,7 @@ impl sealed::Sealed for Option {} #[cfg(feature = "host")] pub 
struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { - stream: &'stream Stream, + stream: Stream<'stream>, value: T, status: AsyncStatus<'a, T, C>, _capture: PhantomData<&'a ()>, @@ -124,7 +124,7 @@ enum AsyncStatus<'a, T: BorrowMut, C: Completion> { impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'stream, T, C> { /// Wraps a `value` which is ready on `stream`. #[must_use] - pub const fn ready(value: T, stream: &'stream Stream) -> Self { + pub const fn ready(value: T, stream: Stream<'stream>) -> Self { Self { stream, value, @@ -139,7 +139,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { + pub fn pending(value: T, stream: Stream<'stream>, completion: C) -> CudaResult { let (sender, receiver) = oneshot::channel(); stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; @@ -203,7 +203,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// CUDA. 
pub fn move_to_stream<'stream_new>( self, - stream: &'stream_new Stream, + stream: Stream<'stream_new>, ) -> CudaResult> { let (old_stream, mut value, status) = self.destructure_into_parts(); @@ -229,7 +229,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea }; let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); - event.record(old_stream)?; + event.record(&old_stream)?; stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; let (sender, receiver) = oneshot::channel(); @@ -302,7 +302,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea self.stream .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; - event.record(self.stream)?; + event.record(&self.stream)?; self.status = AsyncStatus::Processing { receiver, @@ -318,7 +318,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } #[must_use] - fn destructure_into_parts(self) -> (&'stream Stream, T, AsyncStatus<'a, T, C>) { + fn destructure_into_parts(self) -> (Stream<'stream>, T, AsyncStatus<'a, T, C>) { let this = std::mem::ManuallyDrop::new(self); // Safety: we destructure self into its droppable components, @@ -354,7 +354,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<' #[cfg(feature = "host")] struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { - _stream: PhantomData<&'stream Stream>, + _stream: PhantomData>, value: Option, completion: Option, status: AsyncStatus<'a, T, NoCompletion>, @@ -435,7 +435,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture }; AsyncFuture { - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: Some(value), completion, status, @@ -476,7 +476,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop #[allow(clippy::module_name_repetitions)] pub struct AsyncProj<'a, 'stream, T: 'a> { _capture: PhantomData<&'a ()>, - _stream: PhantomData<&'stream Stream>, + _stream: PhantomData>, value: T, use_callback: Option CudaResult<()> + 'a>>, } 
@@ -495,7 +495,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { ) -> Self { Self { _capture: PhantomData::<&'a ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value, use_callback, } @@ -540,7 +540,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: &self.value, use_callback: None, } @@ -553,7 +553,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: &mut self.value, use_callback: self.use_callback.as_mut().map(|use_callback| { let use_callback: Box CudaResult<()>> = Box::new(use_callback); @@ -578,7 +578,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: None, } @@ -607,7 +607,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: None, } @@ -620,7 +620,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: self.use_callback.as_mut().map(|use_callback| { let use_callback: Box CudaResult<()>> = Box::new(use_callback); diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 7db5ba3a2..e252d0ce7 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -180,7 +180,7 @@ impl( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, 
DeviceAccessible>>, CombinedCudaAlloc, @@ -195,7 +195,7 @@ impl( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -232,7 +232,7 @@ impl( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -159,7 +159,7 @@ unsafe impl( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index faa9c5b44..9f1196bb0 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -119,7 +119,7 @@ impl( mut self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult, NoCompletion>> { let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; let (cuda_repr, _completion): (_, Option) = @@ -132,7 +132,7 @@ impl( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult< Async< 'static, @@ -265,7 +265,7 @@ impl< /// CUDA pub fn move_to_host_async( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult< Async< 'static, From 0b355cc4c1bd417e8d3eefd1652a329b16c37f01 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 08:56:07 +0000 Subject: [PATCH 105/120] Try without ref proj --- src/kernel/param.rs | 40 +++++++++++++++++++++-------------- src/utils/exchange/wrapper.rs | 2 +- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 8cafb8c8d..ff53f6dd4 100644 --- 
a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -156,7 +156,7 @@ impl< type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T>, + crate::host::HostAndDeviceConstRef<'b, T>, > where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T where Self: 'b; @@ -173,8 +173,9 @@ impl< where Self: 'b, { + let _ = stream; crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner.with(const_ref.as_async(stream).as_ref()) + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) }) }) } @@ -257,9 +258,10 @@ impl< where Self: 'b, { + let _ = stream; // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner.with(const_ref.as_async(stream).as_ref()) + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) }) }) } @@ -272,7 +274,8 @@ impl< where Self: 'b, { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param_ref = param.proj_ref(); + let param = unsafe { param_ref.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -360,7 +363,7 @@ impl< type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T> + crate::host::HostAndDeviceConstRef<'b, T> > where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T where Self: 'b; @@ -379,8 +382,9 @@ impl< where Self: 'b, { - crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner.with(const_ref.as_ref().as_async(stream).as_ref()) + let _ = stream; + crate::host::HostAndDeviceMutRef::with_new(param, |mut_ref| { + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(mut_ref.as_ref(), None) }) }) } @@ -580,7 +584,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef< + 
crate::host::HostAndDeviceConstRef< 'b, DeviceAccessible<::CudaRepresentation>, >, @@ -601,8 +605,9 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow CudaKernelParameter type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b mut crate::host::HostAndDeviceMutRef< + crate::host::HostAndDeviceMutRef< 'b, DeviceAccessible<::CudaRepresentation>, >, @@ -690,7 +695,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter inner.with({ // Safety: this projection cannot be moved to a different stream // without first exiting lend_to_cuda_mut and synchronizing - unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) } }) }) } @@ -727,7 +732,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter Self: 'b, { param.record_mut_use()?; - let param = unsafe { param.unwrap_unchecked() }; + let mut param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -858,8 +863,9 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit CudaKernelParameter for &'a PtxJit CudaKernelParameter inner.with({ // Safety: this projection cannot be moved to a different stream // without first exiting lend_to_cuda_mut and synchronizing - unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) } }) }) } @@ -959,7 +966,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter where Self: 'b, { - let param = unsafe { param.as_ref().unwrap_unchecked() }; + let param_ref = param.proj_ref(); + let param = unsafe { param_ref.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 9f1196bb0..f84bfa04d 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs 
@@ -340,7 +340,7 @@ impl< ) -> AsyncProj< '_, 'stream, - HostAndDeviceMutRef::CudaRepresentation>>, + HostAndDeviceMutRef<'_, DeviceAccessible<::CudaRepresentation>>, > where T: SafeMutableAliasing, From e6f20dc77ecb07de54d5e16addbbf39376542e8e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 09:10:07 +0000 Subject: [PATCH 106/120] Try add extract ref --- src/host/mod.rs | 11 +++++++ src/utils/async.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/src/host/mod.rs b/src/host/mod.rs index 23bab2706..589556560 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -229,6 +229,17 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } + #[must_use] + pub(crate) unsafe fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + #[must_use] pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> where diff --git a/src/utils/async.rs b/src/utils/async.rs index 24ef8bfc2..be4e2458c 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -333,6 +333,82 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } } +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceConstRef<'a, T>, C> +where + crate::host::HostAndDeviceConstRef<'a, T>: BorrowMut, +{ + pub const fn extract_ref( + &self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceMutRef<'a, T>, C> +where + 
crate::host::HostAndDeviceMutRef<'a, T>: BorrowMut, +{ + pub fn extract_ref(&self) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } + + pub fn extract_mut( + &mut self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceMutRef<'_, T>> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + self.value.as_mut(), + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData, + }; + + Ok(()) + })), + ) + } + } +} + #[cfg(feature = "host")] impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { fn drop(&mut self) { From 4148959b21ba72881434e6d1f94fd4bd35f27e2f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 09:22:18 +0000 Subject: [PATCH 107/120] Fix doc link --- src/utils/exchange/wrapper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index f84bfa04d..bb137a4af 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -87,7 +87,7 @@ impl> ExchangeWrapperOnHost { /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably - /// via [`ExchangeWrapperOnDevice::as_mut`]. 
+ /// via [`ExchangeWrapperOnDevice::as_mut_async`](Async::as_mut_async). /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside From d1f141e9044ffa24bd286c3b8dd1213ca74436cf Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 14 Jan 2024 03:22:39 +0000 Subject: [PATCH 108/120] clean up kernel signature check --- .../src/rust_to_cuda/field_copy.rs | 2 +- rust-cuda-kernel/src/kernel/link/config.rs | 2 +- rust-cuda-kernel/src/kernel/link/mod.rs | 30 +++++++--- rust-cuda-kernel/src/kernel/mod.rs | 2 + .../wrapper/generate/cuda_generic_function.rs | 2 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 40 +++++-------- .../kernel/wrapper/generate/host_kernel_ty.rs | 2 +- .../generate/host_linker_macro/args_trait.rs | 2 +- .../generate/host_linker_macro/get_ptx.rs | 56 +++++++++---------- .../wrapper/generate/host_linker_macro/mod.rs | 2 +- rust-cuda-kernel/src/kernel/wrapper/mod.rs | 6 +- src/safety/mod.rs | 4 +- ...kernel_signature.rs => ptx_entry_point.rs} | 18 ++++-- src/safety/ptx_kernel_signature.rs | 51 +++++++++++++++++ src/safety/type_layout.rs | 33 ----------- 15 files changed, 136 insertions(+), 116 deletions(-) rename src/safety/{kernel_signature.rs => ptx_entry_point.rs} (62%) create mode 100644 src/safety/ptx_kernel_signature.rs delete mode 100644 src/safety/type_layout.rs diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 05d133156..18fd867c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use quote::{format_ident, quote, ToTokens}; -use super::field_ty::CudaReprFieldTy; +use crate::rust_to_cuda::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( diff --git a/rust-cuda-kernel/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs index 
d7a4d0458..469318f02 100644 --- a/rust-cuda-kernel/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,6 +1,6 @@ use std::{collections::HashMap, path::PathBuf}; -use super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 8424e7056..b64776707 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -13,15 +13,17 @@ use std::{ use colored::Colorize; use proc_macro::TokenStream; +use proc_macro2::Span; use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; -use super::{ +use crate::kernel::{ lints::{LintLevel, PtxLint}, utils::skip_kernel_compilation, - KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, + PTX_CSTR_IDENT, }; mod config; @@ -33,7 +35,9 @@ use error::emit_ptx_build_error; use ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); + proc_macro_error::set_dummy( + quote! 
{::core::compile_error!("rust-cuda PTX kernel check failed");}, + ); let CheckKernelConfig { kernel, @@ -54,7 +58,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { - return quote!(::core::result::Result::Err(())).into(); + return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into(); }; check_kernel_ptx_and_report( @@ -64,13 +68,18 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { &HashMap::new(), ); - quote!(::core::result::Result::Ok(())).into() + quote!().into() } #[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site()); + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site()); + proc_macro_error::set_dummy(quote! { - const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); let LinkKernelConfig { @@ -94,7 +103,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { if skip_kernel_compilation() { return quote! { - const PTX_CSTR: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; } .into(); } @@ -106,7 +115,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Specialisation::Link(&specialisation), ) else { return (quote! 
{ - const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); }; @@ -137,7 +148,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; - (quote! { const PTX_CSTR: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }).into() + (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }) + .into() } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { diff --git a/rust-cuda-kernel/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs index 9e3a80789..86ffbd8fd 100644 --- a/rust-cuda-kernel/src/kernel/mod.rs +++ b/rust-cuda-kernel/src/kernel/mod.rs @@ -7,3 +7,5 @@ mod utils; const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; +const KERNEL_TYPE_LAYOUT_IDENT: &str = "KERNEL_SIGNATURE_LAYOUT"; +const PTX_CSTR_IDENT: &str = "PTX_CSTR"; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 62cb3456d..ccf21c96b 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::{DeclGenerics, FuncIdent}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; pub(in super::super) fn quote_cuda_generic_function( crate_path: &syn::Path, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 74ab20f5b..938074e56 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,9 +1,9 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::{ - super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, +use crate::kernel::{ + wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, }; #[allow(clippy::too_many_lines)] @@ -25,16 +25,6 @@ pub(in super::super) fn quote_cuda_wrapper( let (ffi_inputs, ffi_types) = specialise_ffi_input_types(crate_path, inputs, func, impl_generics); - let func_layout_params = func_params - .iter() - .map(|ident| { - syn::Ident::new( - &format!("__{func_ident_hash}_{ident}_layout").to_uppercase(), - ident.span(), - ) - }) - .collect::>(); - let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( quote! { #func_ident(#(#func_params),*) @@ -70,6 +60,9 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#ffi_types),*) }; + quote! 
{ #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] @@ -89,20 +82,13 @@ pub(in super::super) fn quote_cuda_wrapper( #crate_path::utils::shared::init(); } - unsafe { - ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); - } - #( - #[no_mangle] - static #func_layout_params: [ - u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_types>() - ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_types>(); - - unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; - )* - unsafe { - ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); - } + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } + #[no_mangle] + static #ffi_signature_ident: [ + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_signature_ty>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_signature_ty>(); + unsafe { ::core::ptr::read_volatile(&#ffi_signature_ident) }; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } #ffi_param_ptx_jit_wrap } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs index 84ece28b5..78e972d69 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_host_kernel_ty( crate_path: &syn::Path, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs index 25cc27955..26653e435 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs +++ 
b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::super::{FunctionInputs, ImplGenerics}; +use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_args_trait( args: &syn::Ident, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 303b43ff1..955093e5c 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -1,9 +1,11 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::skip_kernel_compilation; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use crate::kernel::{ + utils::skip_kernel_compilation, + wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, PTX_CSTR_IDENT, +}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -38,15 +40,17 @@ pub(super) fn quote_get_ptx( let cpu_func_lifetime_erased_types = generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, func_ident.span()); + let matching_kernel_assert = if skip_kernel_compilation() { quote!() } else { quote::quote_spanned! 
{ func_ident.span()=> - const _: #crate_path::safety::kernel_signature::Assert<{ - #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = #crate_path::safety::kernel_signature::Assert::<{ - #crate_path::safety::kernel_signature::check( - PTX_CSTR.to_bytes(), + const _: #crate_path::safety::ptx_entry_point::Assert<{ + #crate_path::safety::ptx_entry_point::HostAndDeviceKernelEntryPoint::Match + }> = #crate_path::safety::ptx_entry_point::Assert::<{ + #crate_path::safety::ptx_entry_point::check( + #ptx_cstr_ident.to_bytes(), #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* @@ -57,27 +61,19 @@ pub(super) fn quote_get_ptx( } }; - let type_layout_asserts = if skip_kernel_compilation() { - Vec::new() + let signature_layout_assert = if skip_kernel_compilation() { + quote!() } else { - cpu_func_lifetime_erased_types - .iter() - .zip(func_params.iter()) - .map(|(ty, param)| { - let layout_param = syn::Ident::new( - &format!("__{func_ident_hash}_{param}_layout").to_uppercase(), - param.span(), - ); - - quote::quote_spanned! { ty.span()=> - const _: #crate_path::safety::type_layout::Assert<{ - #crate_path::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = #crate_path::safety::type_layout::Assert::<{ - #crate_path::safety::type_layout::check::<#ty>(#layout_param) - }>; - } - }) - .collect::>() + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; + + quote::quote_spanned! 
{ func_ident.span()=> + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) + }>; + } }; let private_func_params = func_params @@ -107,9 +103,9 @@ pub(super) fn quote_get_ptx( #matching_kernel_assert - #(#type_layout_asserts)* + #signature_layout_assert - PTX_CSTR + #ptx_cstr_ident } } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs index cfc0af751..36479b62a 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod args_trait; mod get_ptx; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index 7793c2dc0..f400e3147 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -9,7 +9,7 @@ mod config; mod generate; mod parse; -use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ @@ -346,8 +346,8 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::core::result::Result<(), ()> = #crate_path::kernel::check_kernel!( + #crate_path::kernel::check_kernel! 
{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir - ); + } } } diff --git a/src/safety/mod.rs b/src/safety/mod.rs index c26ef3389..7e078e34e 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -4,9 +4,9 @@ mod portable; mod stack_only; #[doc(hidden)] -pub mod kernel_signature; +pub mod ptx_entry_point; #[doc(hidden)] -pub mod type_layout; +pub mod ptx_kernel_signature; pub use aliasing::SafeMutableAliasing; pub use portable::PortableBitSemantics; diff --git a/src/safety/kernel_signature.rs b/src/safety/ptx_entry_point.rs similarity index 62% rename from src/safety/kernel_signature.rs rename to src/safety/ptx_entry_point.rs index 96bdd3f32..b1d62cf4e 100644 --- a/src/safety/kernel_signature.rs +++ b/src/safety/ptx_entry_point.rs @@ -1,30 +1,36 @@ #[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuKernelSignatures { +pub enum HostAndDeviceKernelEntryPoint { Match, Mismatch, } -pub struct Assert; +pub struct Assert; #[must_use] -pub const fn check(ptx: &[u8], entry_point: &[u8]) -> CpuAndGpuKernelSignatures { +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> HostAndDeviceKernelEntryPoint { + const PTX_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; const KERNEL_TYPE: &[u8] = b".visible .entry "; + // Short-circuit to avoid extra errors when PTX compilation fails + if ptx.len() == PTX_ERROR_MESSAGE.len() && starts_with(ptx, PTX_ERROR_MESSAGE, 0) { + return HostAndDeviceKernelEntryPoint::Match; + } + let mut j = 0; while j < ptx.len() { let Some(j2) = find(ptx, KERNEL_TYPE, j) else { - return CpuAndGpuKernelSignatures::Mismatch; + return HostAndDeviceKernelEntryPoint::Mismatch; }; if starts_with(ptx, entry_point, j2) { - return CpuAndGpuKernelSignatures::Match; + return HostAndDeviceKernelEntryPoint::Match; } j += 1; } - CpuAndGpuKernelSignatures::Mismatch + HostAndDeviceKernelEntryPoint::Mismatch } const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { diff --git a/src/safety/ptx_kernel_signature.rs 
b/src/safety/ptx_kernel_signature.rs new file mode 100644 index 000000000..eb4a63820 --- /dev/null +++ b/src/safety/ptx_kernel_signature.rs @@ -0,0 +1,51 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if host.len() != device.len() { + return false; + } + + let mut i = 0; + + while i < host.len() { + if host[i] != device[i] { + return false; + } + + i += 1; + } + + true +} diff --git a/src/safety/type_layout.rs b/src/safety/type_layout.rs deleted file mode 100644 index f225f0055..000000000 --- a/src/safety/type_layout.rs +++ /dev/null @@ -1,33 +0,0 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; - -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuTypeLayouts { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(device: &'static [u8]) -> CpuAndGpuTypeLayouts -where - [u8; serialised_type_graph_len::()]:, -{ - let host = serialise_type_graph::(); - - if host.len() != device.len() { - return CpuAndGpuTypeLayouts::Mismatch; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != 
device[i] { - return CpuAndGpuTypeLayouts::Mismatch; - } - - i += 1; - } - - CpuAndGpuTypeLayouts::Match -} From 19120623df08f95f2b0786bdcdb3cdecfdc28842 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 18 Jan 2024 13:53:51 +0000 Subject: [PATCH 109/120] Some cleanup before merging --- examples/print/src/main.rs | 10 ++++- rust-cuda-kernel/src/kernel/link/mod.rs | 23 +++++----- .../src/kernel/specialise/entry_point.rs | 23 ++++------ rust-cuda-kernel/src/lib.rs | 1 + src/device/utils.rs | 43 ++++++++++++++----- src/kernel/mod.rs | 14 +++--- src/kernel/param.rs | 36 ++++++++++------ src/lend/impls/option.rs | 6 +-- 8 files changed, 94 insertions(+), 62 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7cd9ab3f2..93a50ba55 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -93,12 +93,18 @@ mod cuda_prelude { fn panic(info: &::core::panic::PanicInfo) -> ! { // pretty format and print the panic message // but don't allow dynamic formatting or panic payload downcasting - rust_cuda::device::utils::pretty_panic_handler(info, false, false) + rust_cuda::device::utils::pretty_print_panic_info(info, false, false); + + // Safety: no mutable data is shared with the kernel + unsafe { rust_cuda::device::utils::exit() } } #[alloc_error_handler] #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ - rust_cuda::device::utils::pretty_alloc_error_handler(layout) + rust_cuda::device::utils::pretty_print_alloc_error(layout); + + // Safety: no mutable data is shared with the kernel + unsafe { rust_cuda::device::utils::exit() } } } diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index b64776707..b83eab9d2 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, env, - ffi::{CStr, CString}, + ffi::CString, fmt::Write as FmtWrite, fs, io::{Read, Write}, @@ -132,21 +132,18 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { &ptx_lint_levels, ); - let mut kernel_ptx = kernel_ptx.into_bytes(); - kernel_ptx.push(b'\0'); - - if let Err(err) = CStr::from_bytes_with_nul(&kernel_ptx) { - abort_call_site!( + let kernel_ptx = match CString::new(kernel_ptx) { + Ok(kernel_ptx) => kernel_ptx, + Err(err) => abort_call_site!( "Kernel compilation generated invalid PTX: internal nul byte: {:?}", err - ); - } + ), + }; - // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 - let kernel_ptx = syn::LitByteStr::new(&kernel_ptx, proc_macro2::Span::call_site()); - // Safety: the validity of kernel_ptx as a CStr was just checked above - let kernel_ptx = - quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; + let kernel_ptx = proc_macro::Literal::c_string(&kernel_ptx); + let kernel_ptx = proc_macro2::TokenStream::from(proc_macro::TokenStream::from( + proc_macro::TokenTree::Literal(kernel_ptx), + )); (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }) .into() diff --git a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 5653a5539..2bc50b0e5 100644 --- a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -1,4 +1,4 @@ -use std::ffi::CStr; +use std::ffi::CString; use proc_macro::TokenStream; @@ -27,23 +27,16 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; - let mut mangled_kernel_ident = mangled_kernel_ident.into_bytes(); - mangled_kernel_ident.push(b'\0'); - - if let Err(err) = CStr::from_bytes_with_nul(&mangled_kernel_ident) { - abort_call_site!( + let mangled_kernel_ident = match CString::new(mangled_kernel_ident) { + Ok(mangled_kernel_ident) => mangled_kernel_ident, + Err(err) => abort_call_site!( "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", err - ); - } - - // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 - let mangled_kernel_ident = - syn::LitByteStr::new(&mangled_kernel_ident, proc_macro2::Span::call_site()); - // Safety: the validity of mangled_kernel_ident as a CStr was just checked above - let mangled_kernel_ident = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#mangled_kernel_ident) } }; + ), + }; - (quote! 
{ #mangled_kernel_ident }).into() + let mangled_kernel_ident = proc_macro::Literal::c_string(&mangled_kernel_ident); + proc_macro::TokenTree::Literal(mangled_kernel_ident).into() } struct SpecialiseMangleConfig { diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index b26a78531..436380ff3 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -5,6 +5,7 @@ #![feature(let_chains)] #![feature(map_try_insert)] #![feature(proc_macro_def_site)] +#![feature(proc_macro_c_str_literals)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/device/utils.rs b/src/device/utils.rs index cbc5080ab..8447c5235 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,15 +1,27 @@ use crate::deps::alloc::{fmt, string::String}; /// Abort the CUDA kernel using the `trap` system call. +/// +/// [`abort`] poisons the CUDA context and no more work can be performed in it. #[allow(clippy::inline_always)] #[inline(always)] pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } +/// Exit the CUDA kernel using the `exit` instruction. +/// +/// # Safety +/// +/// [`exit`] quits the kernel early and any mutable data accessible outside this +/// kernel launch (by the host or a subsequent kernel launch) may be in an +/// inconsistent state. Therefore, kernel failure must be communicated back to +/// host and handled in some other manner. +/// +/// Safely return from the main kernel function instead. #[allow(clippy::inline_always)] #[inline(always)] -pub fn exit() -> ! { +pub unsafe fn exit() -> ! { unsafe { ::core::arch::asm!("exit;", options(noreturn)) } } @@ -68,14 +80,28 @@ pub fn print(args: ::core::fmt::Arguments) { } } -// TODO: docs +/// Helper function to efficiently pretty-print a [`core::panic::PanicInfo`] +/// using the `vprintf` system call. 
+/// +/// If `allow_dynamic_message` is set, +/// [`alloc::fmt::format`](crate::deps::alloc::fmt::format) is used to print +/// [`core::panic::PanicInfo::message`] message when +/// [`core::fmt::Arguments::as_str`] returns [`None`]. Note that this may pull +/// in a large amount of string formatting and dynamic allocation code. +/// If unset, a default placeholder panic message is printed instead. +/// +/// If `allow_dynamic_payload` is set, [`core::panic::PanicInfo::payload`] is +/// checked for [`&str`] and [`String`] to get a message to print if +/// [`core::panic::PanicInfo::message`] returns [`None`]. Note that this may +/// pull in some dynamic dispatch code. If unset, a default placeholder panic +/// message is printed instead. #[allow(clippy::inline_always)] #[inline(always)] -pub fn pretty_panic_handler( +pub fn pretty_print_panic_info( info: &::core::panic::PanicInfo, allow_dynamic_message: bool, allow_dynamic_payload: bool, -) -> ! { +) { #[repr(C)] struct FormatArgs { file_len: u32, @@ -140,15 +166,14 @@ pub fn pretty_panic_handler( ::core::ptr::from_ref(&args).cast(), ); } - - exit() } -// TODO: docs +/// Helper function to efficiently pretty-print an error message (inside an +/// allocation error handler) using the `vprintf` system call. #[track_caller] #[allow(clippy::inline_always)] #[inline(always)] -pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { +pub fn pretty_print_alloc_error(layout: ::core::alloc::Layout) { #[repr(C)] struct FormatArgs { size: usize, @@ -186,6 +211,4 @@ pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { ::core::ptr::from_ref(&args).cast(), ); } - - exit() } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 42e13e0ce..e69b0b15a 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -40,7 +40,8 @@ mod sealed { pub struct Token; } -#[cfg(feature = "host")] // FIXME: make private? 
+#[cfg(all(feature = "host", not(doc)))] +#[doc(hidden)] pub trait WithNewAsync< 'stream, P: ?Sized + CudaKernelParameter, @@ -48,13 +49,12 @@ pub trait WithNewAsync< E: From, > { - #[allow(clippy::missing_errors_doc)] // FIXME fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; } -#[cfg(feature = "host")] // FIXME: make private? +#[cfg(all(feature = "host", not(doc)))] impl< 'stream, P: ?Sized + CudaKernelParameter, @@ -72,6 +72,7 @@ impl< } #[cfg(feature = "device")] +#[doc(hidden)] pub trait WithFfiAsDevice { fn with<'b>(self, param: P::DeviceType<'b>) -> O where @@ -108,13 +109,14 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where - Self: 'param; + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] diff --git a/src/kernel/param.rs b/src/kernel/param.rs index ff53f6dd4..c87148c7a 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -82,7 +82,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -168,7 +169,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl 
FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -253,7 +255,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -377,7 +380,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -514,7 +518,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -600,7 +605,8 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -684,7 +690,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, 
) -> Result where Self: 'b, @@ -774,7 +781,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -857,7 +865,8 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -940,7 +949,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -1067,7 +1077,8 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -1144,7 +1155,8 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, 
Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 0e9c3c34d..3f1d1e160 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -147,10 +147,8 @@ unsafe impl RustToCudaAsync for Option { #[allow(clippy::option_if_let_else)] let (r#async, alloc_tail) = RustToCudaAsync::restore_async( - this.map_mut(|value| match value { - Some(value) => value, - None => unreachable!(), // TODO - }), + // Safety: we have already established value is Some above + this.map_mut(|value| unsafe { value.as_mut().unwrap_unchecked() }), CombinedCudaAlloc::new(alloc_front, alloc_tail), stream, )?; From 30986364465cdc9f50602f8223e2fa338977a0d3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 18 Jan 2024 14:10:14 +0000 Subject: [PATCH 110/120] Fix some clippy lints, add FIXMEs for others --- rust-cuda-derive/src/lib.rs | 2 ++ rust-cuda-kernel/src/kernel/link/error.rs | 9 +++--- rust-cuda-kernel/src/kernel/link/mod.rs | 32 ++++++++----------- .../src/kernel/link/ptx_compiler_sys.rs | 2 +- .../src/kernel/specialise/entry_point.rs | 1 + .../wrapper/generate/cuda_generic_function.rs | 6 ++-- rust-cuda-kernel/src/lib.rs | 10 +++++- src/lib.rs | 5 ++- 8 files changed, 37 insertions(+), 30 deletions(-) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index fba846798..a560b6d67 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,8 @@ #![deny(clippy::perf)] #![deny(clippy::style)] #![deny(clippy::suspicious)] +#![deny(unsafe_code)] +// #![warn(missing_docs)] // FIXME #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/rust-cuda-kernel/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs index 0c83e19a5..811269ccc 100644 --- a/rust-cuda-kernel/src/kernel/link/error.rs +++ 
b/rust-cuda-kernel/src/kernel/link/error.rs @@ -22,15 +22,14 @@ pub fn emit_ptx_build_error() { let call_site = proc_macro::Span::call_site(); - let (byte_start, byte_end) = - if let Some(captures) = PROC_MACRO_SPAN_REGEX.captures(&format!("{call_site:?}")) { + let (byte_start, byte_end) = PROC_MACRO_SPAN_REGEX + .captures(&format!("{call_site:?}")) + .map_or((0_u32, 0_u32), |captures| { ( captures["start"].parse().unwrap_or(0_u32), captures["end"].parse().unwrap_or(0_u32), ) - } else { - (0_u32, 0_u32) - }; + }); let span = DiagnosticSpanBuilder::default() .file_name( diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index b83eab9d2..27c2533c6 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -386,6 +386,7 @@ fn check_kernel_ptx( ) { let compiler = { let mut compiler = std::ptr::null_mut(); + #[allow(unsafe_code)] // FFI if let Err(err) = NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCreate( addr_of_mut!(compiler), @@ -451,6 +452,7 @@ fn check_kernel_ptx( let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, @@ -493,6 +495,7 @@ fn check_kernel_ptx( let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, @@ -505,6 +508,7 @@ fn check_kernel_ptx( let error_log = (|| { let mut error_log_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) })?; @@ -514,23 +518,20 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + let mut error_log: Vec = vec![0; error_log_size as usize]; + #[allow(unsafe_code)] // FFI 
NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - error_log.set_len(error_log_size as usize); - } - Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) })(); let info_log = (|| { let mut info_log_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) })?; @@ -540,17 +541,13 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); + let mut info_log: Vec = vec![0; info_log_size as usize]; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - info_log.set_len(info_log_size as usize); - } - Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) })(); @@ -561,6 +558,7 @@ fn check_kernel_ptx( let mut binary_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( compiler, @@ -573,17 +571,13 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut binary: Vec = Vec::with_capacity(binary_size as usize); + let mut binary: Vec = vec![0; binary_size as usize]; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - binary.set_len(binary_size as usize); - } - Ok(Some(binary)) })(); @@ -591,6 +585,7 @@ fn check_kernel_ptx( let mut major = 0; let mut minor = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) })?; @@ -600,6 +595,7 @@ fn check_kernel_ptx( 
let drop = { let mut compiler = compiler; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) }) diff --git a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs index fac72cebf..7fffc7b4c 100644 --- a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs @@ -42,7 +42,7 @@ impl NvptxError { const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; - pub fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + pub const fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { match result { Self::NVPTXCOMPILE_SUCCESS => Ok(()), Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), diff --git a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 2bc50b0e5..1c80b7899 100644 --- a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -18,6 +18,7 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { }, }; + #[allow(clippy::option_if_let_else)] let mangled_kernel_ident = if let Some(specialisation) = specialisation { format!( "{kernel}_kernel_{:016x}", diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index ccf21c96b..00e00d7d8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -38,9 +38,7 @@ pub(in super::super) fn quote_cuda_generic_function( elem, }) = &**ty { - let lifetime = if let Some(lifetime) = lifetime { - lifetime.clone() - } else { + let lifetime = lifetime.clone().unwrap_or_else(|| { let 
lifetime = syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); generic_params.insert( @@ -53,7 +51,7 @@ pub(in super::super) fn quote_cuda_generic_function( }), ); lifetime - }; + }); let lt = quote!(#lifetime); ( syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 436380ff3..6aa0d44c7 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -1,4 +1,12 @@ -#![deny(clippy::pedantic)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +// #![warn(missing_docs)] // FIXME #![feature(box_patterns)] #![feature(proc_macro_tracked_env)] #![feature(proc_macro_span)] diff --git a/src/lib.rs b/src/lib.rs index c782c4047..2a63674fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,10 @@ #![deny(clippy::perf)] #![deny(clippy::style)] #![deny(clippy::suspicious)] -#![allow(clippy::useless_attribute)] +// #![warn(clippy::multiple_unsafe_ops_per_block)] // FIXME +// #![warn(clippy::undocumented_unsafe_blocks)] // FIXME +#![deny(unused_unsafe)] +// #![warn(missing_docs)] // FIXME #![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] From 54eacc9245aea9118970fdd20ae03b3008b569b3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 19 Jan 2024 05:04:47 +0000 Subject: [PATCH 111/120] Add docs for rust-cuda-derive --- README.md | 5 +- rust-cuda-derive/src/lib.rs | 94 ++++++++++++++++++- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 6 +- rust-cuda-derive/src/rust_to_cuda/generics.rs | 10 +- rust-cuda-kernel/src/lib.rs | 22 +++++ src/lib.rs | 22 +++++ 6 files changed, 146 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e9b24ddbb..5080b7033 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ -# rust-cuda   
[![CI Status]][workflow] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod] +# rust-cuda   [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod] [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +[MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +[repo]: https://github.com/juntyr/rust-cuda + [Rust Doc]: https://img.shields.io/badge/docs-main-blue [docs]: https://juntyr.github.io/rust-cuda/ diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index a560b6d67..5b897a8b2 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,3 +1,37 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_derive/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! 
[Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-derive` provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) +//! derive macro for the +//! [`rust_cuda::lend::RustToCuda`] +//! utility trait, which enables the usage of the +//! [`rust_cuda::lend::LendToCuda`] +//! trait that allows Rust data structures to be shared with CUDA kernels. +//! +//! The async variants of both traits are *optionally* implemented as well. +//! +//! [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +//! [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] @@ -6,7 +40,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![deny(unsafe_code)] -// #![warn(missing_docs)] // FIXME +#![deny(missing_docs)] #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -22,6 +56,64 @@ mod rust_to_cuda; #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] +/// Provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) +/// derive macro for the +/// [`rust_cuda::lend::RustToCuda`] +/// utility trait, which enables the usage of the +/// [`rust_cuda::lend::LendToCuda`] +/// trait that allows Rust data structures to be shared with CUDA kernels. +/// +/// At the moment, only +/// [`struct`](https://doc.rust-lang.org/std/keyword.struct.html)s are supported +/// by this derive macro. +/// +/// The derive also accepts a `#[cuda(...)]` attribute. You can annotate the +/// entire struct with the `#[cuda(...)]` to configure the implementation as +/// follows: +/// +/// - `#[cuda(crate = "")]` changes the path to the [`rust-cuda`] +/// crate that the derive uses, which by default is `rust_cuda`. 
+/// - `#[cuda(bound = "")]` adds the provided predicate to the +/// where clause of the trait implementation. +/// - `#[cuda(free = "")]` removes the the auto-added trait bounds for the +/// type parameter `` from the trait implementation, e.g. when +/// implementing a wrapper around [`std::marker::PhantomData`] which should +/// implement the trait for any `T`. +/// - `#[cuda(async = )]` explicitly enables or disables the async +/// implementation of the trait, [`rust_cuda::lend::RustToCudaAsync`]. By +/// default, `#[cuda(async = true)]` is set. +/// - `#[cuda(layout::ATTR = "VALUE")]` adds the `#[layout(ATTR = "VALUE")]` +/// attribute to the [`#derive(const_type_layout::TypeLayout)`] derive for +/// this struct's [`rust_cuda::lend::RustToCuda::CudaRepresentation`]. +/// - `#[cuda(ignore)]` removes all subsequent attributes from the generated +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. +/// +/// Additionally, the `#[cuda(...)]` attribute can also be applied individually +/// to the fields of the struct to customise the implementation as follows: +/// +/// - `#[cuda(embed)]` signals that this field has a non-identity CUDA +/// representation and should be embedded by using the +/// [`rust_cuda::lend::RustToCuda`] implementation of this field's type. When +/// this attribute is not specified, the field must instead implement +/// [`Copy`], [`rust_cuda::safety::PortableBitSemantics`], and +/// [`const_type_layout::TypeGraphLayout`]. +/// - `#[cuda(embed = "")]` works like `#[cuda(embed)]` but can be +/// used when the field's type does not implement +/// [`rust_cuda::lend::RustToCuda`] itself, but some `` exists, +/// which implements [`rust_cuda::lend::RustToCudaProxy`] for the field's +/// type. +/// - `#[cuda(ignore)]` removes all subsequent attributes from this field in the +/// generated [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. 
+/// +/// [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +/// [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda +/// [`rust_cuda::lend::RustToCudaAsync`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaAsync.html +/// [`#derive(const_type_layout::TypeLayout)`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/derive.TypeLayout.html +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html#associatedtype.CudaRepresentation +/// [`rust_cuda::safety::PortableBitSemantics`]: https://juntyr.github.io/rust-cuda/rust_cuda/safety/trait.PortableBitSemantics.html +/// [`const_type_layout::TypeGraphLayout`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/trait.TypeGraphLayout.html +/// [`rust_cuda::lend::RustToCudaProxy`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaProxy.html pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Note: We cannot report a more precise span yet let ast = match syn::parse(input) { diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index b2f624d66..c9fe48b77 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -69,7 +69,7 @@ pub fn swap_field_type_and_filter_attrs( Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(embed = \ - \"\")] field attribute: {}.", + \"\")] field attribute: {}.", err ), } @@ -84,7 +84,7 @@ pub fn swap_field_type_and_filter_attrs( emit_error!( meta.span(), "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ - #[cuda(embed = \"\")] field attribute" + #[cuda(embed = \"\")] field attribute" ); } } @@ -93,7 +93,7 @@ pub fn swap_field_type_and_filter_attrs( emit_error!( 
attr.span(), "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ - #[cuda(embed = \"\")] field attribute." + #[cuda(embed = \"\")] field attribute." ); } diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 4325f39fb..f090f5c70 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -159,10 +159,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / \ - #[cuda(bound = \"\")] / \ - #[cuda(crate = \"\")] / \ - #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); }, } @@ -170,10 +167,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / \ - #[cuda(bound = \"\")] / \ - #[cuda(crate = \"\")] / \ - #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); } diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 6aa0d44c7..86d4137bb 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -1,3 +1,25 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! 
[repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_kernel/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] diff --git a/src/lib.rs b/src/lib.rs index 2a63674fa..df8c42290 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,25 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! 
[codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] From fd9682d2635117a5dc1af302fa1c98de882c631d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 11:58:35 +0000 Subject: [PATCH 112/120] Small refactoring + added docs for rust-cuda-kernel --- examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 2 +- rust-cuda-derive/src/lib.rs | 4 +- rust-cuda-kernel/src/kernel/link/mod.rs | 18 +-- rust-cuda-kernel/src/kernel/lints.rs | 12 +- rust-cuda-kernel/src/kernel/specialise/mod.rs | 2 +- .../specialise/{ty.rs => param_type.rs} | 7 +- rust-cuda-kernel/src/kernel/wrapper/config.rs | 6 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- .../args_trait.rs | 0 .../get_ptx.rs | 2 +- .../mod.rs | 6 +- .../src/kernel/wrapper/generate/mod.rs | 2 +- rust-cuda-kernel/src/kernel/wrapper/mod.rs | 12 +- rust-cuda-kernel/src/lib.rs | 127 +++++++++++++++++- src/device/mod.rs | 2 +- src/kernel/mod.rs | 2 +- 17 files changed, 164 insertions(+), 46 deletions(-) rename rust-cuda-kernel/src/kernel/specialise/{ty.rs => param_type.rs} (96%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/args_trait.rs (100%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/get_ptx.rs (99%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/mod.rs (96%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 93a50ba55..5aec2b391 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -22,7 +22,7 @@ pub enum Action { } #[rust_cuda::kernel::kernel(use link! 
for impl)] -#[kernel(allow(ptx::local_memory_usage))] +#[kernel(allow(ptx::local_memory_use))] pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 89bbdf990..f0aa25ce7 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -49,7 +49,7 @@ pub struct Triple(i32, i32, i32); #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), - forbid(ptx::local_memory_usage, ptx::register_spills) + forbid(ptx::local_memory_use, ptx::register_spills) )] pub fn kernel< 'a, diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 5b897a8b2..514bbf66e 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -20,8 +20,8 @@ //! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod //! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda //! -//! `rust-cuda-derive` provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) -//! derive macro for the +//! `rust-cuda-derive` provides the +//! [`#[derive(LendRustToCuda)]`](LendRustToCuda) derive macro for the //! [`rust_cuda::lend::RustToCuda`] //! utility trait, which enables the usage of the //! 
[`rust_cuda::lend::LendToCuda`] diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 27c2533c6..10d3b63ed 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -55,7 +55,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { }, }; - let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); + let kernel_ptx = compile_kernel_ptx(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into(); @@ -72,7 +72,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { } #[allow(clippy::module_name_repetitions)] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site()); let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site()); @@ -93,7 +93,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ + "compile_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \ tokens: {:?}", err @@ -108,7 +108,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .into(); } - let Some(mut kernel_ptx) = compile_kernel( + let Some(mut kernel_ptx) = compile_kernel_ptx( &kernel, &crate_name, &crate_path, @@ -285,7 +285,7 @@ fn check_kernel_ptx_and_report( Ok(None) => (), Ok(Some(binary)) => { if ptx_lint_levels - .get(&PtxLint::DumpBinary) + .get(&PtxLint::DumpAssembly) .map_or(false, |level| *level > LintLevel::Allow) { const HEX: [char; 16] = [ @@ -299,7 +299,7 @@ fn check_kernel_ptx_and_report( } if ptx_lint_levels - 
.get(&PtxLint::DumpBinary) + .get(&PtxLint::DumpAssembly) .map_or(false, |level| *level > LintLevel::Warn) { emit_call_site_error!( @@ -431,7 +431,7 @@ fn check_kernel_ptx( options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels - .get(&PtxLint::LocalMemoryUsage) + .get(&PtxLint::LocalMemoryUse) .map_or(false, |level| *level > LintLevel::Warn) { options.push(c"--warn-on-local-memory-usage"); @@ -475,7 +475,7 @@ fn check_kernel_ptx( options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels - .get(&PtxLint::LocalMemoryUsage) + .get(&PtxLint::LocalMemoryUse) .map_or(false, |level| *level > LintLevel::Allow) { options.push(c"--warn-on-local-memory-usage"); @@ -604,7 +604,7 @@ fn check_kernel_ptx( (result, error_log, info_log, binary, version, drop) } -fn compile_kernel( +fn compile_kernel_ptx( kernel: &syn::Ident, crate_name: &str, crate_path: &Path, diff --git a/rust-cuda-kernel/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs index 6c198b71a..5fbe415b2 100644 --- a/rust-cuda-kernel/src/kernel/lints.rs +++ b/rust-cuda-kernel/src/kernel/lints.rs @@ -88,9 +88,9 @@ pub fn parse_ptx_lint_level( let lint = match lint { l if l == "verbose" => PtxLint::Verbose, l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, - l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "local_memory_use" => PtxLint::LocalMemoryUse, l if l == "register_spills" => PtxLint::RegisterSpills, - l if l == "dump_binary" => PtxLint::DumpBinary, + l if l == "dump_assembly" => PtxLint::DumpAssembly, l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, _ => { emit_error!( @@ -151,9 +151,9 @@ impl fmt::Display for LintLevel { pub enum PtxLint { Verbose, DoublePrecisionUse, - LocalMemoryUsage, + LocalMemoryUse, RegisterSpills, - DumpBinary, + DumpAssembly, DynamicStackSize, } @@ -162,9 +162,9 @@ impl fmt::Display for PtxLint { match self { Self::Verbose => fmt.write_str("verbose"), Self::DoublePrecisionUse => 
fmt.write_str("double_precision_use"), - Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::LocalMemoryUse => fmt.write_str("local_memory_use"), Self::RegisterSpills => fmt.write_str("register_spills"), - Self::DumpBinary => fmt.write_str("dump_binary"), + Self::DumpAssembly => fmt.write_str("dump_assembly"), Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), } } diff --git a/rust-cuda-kernel/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs index 6d30d4d5d..e5dcd518e 100644 --- a/rust-cuda-kernel/src/kernel/specialise/mod.rs +++ b/rust-cuda-kernel/src/kernel/specialise/mod.rs @@ -1,3 +1,3 @@ pub mod entry_point; pub mod function; -pub mod ty; +pub mod param_type; diff --git a/rust-cuda-kernel/src/kernel/specialise/ty.rs b/rust-cuda-kernel/src/kernel/specialise/param_type.rs similarity index 96% rename from rust-cuda-kernel/src/kernel/specialise/ty.rs rename to rust-cuda-kernel/src/kernel/specialise/param_type.rs index 1671f43f0..a398e5eac 100644 --- a/rust-cuda-kernel/src/kernel/specialise/ty.rs +++ b/rust-cuda-kernel/src/kernel/specialise/param_type.rs @@ -1,7 +1,8 @@ use proc_macro::TokenStream; use quote::ToTokens; -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { +#[allow(clippy::module_name_repetitions)] +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { let SpecialiseTypeConfig { mut ty, generics, @@ -10,8 +11,8 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(TY for GENERICS in KERNEL) expects TY type, GENERICS \ - generics, and KERNEL identifier: {:?}", + "specialise_kernel_param_type!(TY for GENERICS in KERNEL) expects TY type, \ + GENERICS generics, and KERNEL identifier: {:?}", err ) }, diff --git a/rust-cuda-kernel/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs index 8f8cd2240..66807f2d1 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/config.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/config.rs @@ -1,17 +1,17 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, - pub(super) linker: syn::Ident, + pub(super) link: syn::Ident, } impl syn::parse::Parse for KernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let visibility: Option = input.parse()?; let _use: syn::token::Use = input.parse()?; - let linker: syn::Ident = input.parse()?; + let link: syn::Ident = input.parse()?; let _bang: syn::token::Bang = input.parse()?; let _for: syn::token::For = input.parse()?; let _impl: syn::token::Impl = input.parse()?; - Ok(Self { visibility, linker }) + Ok(Self { visibility, link }) } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 938074e56..c3cb11458 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -31,7 +31,7 @@ pub(in super::super) fn quote_cuda_wrapper( }, |inner, (i, syn::PatType { pat, ty, .. })| { let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) + #crate_path::device::specialise_kernel_param_type!(#ty for #generics in #func_ident) }; // Load the device param from its FFI representation @@ -110,7 +110,7 @@ fn specialise_ffi_input_types( ty, }| { let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) + #crate_path::device::specialise_kernel_param_type!(#ty for #impl_generics in #func_ident) }; let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs similarity index 100% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs similarity index 99% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 955093e5c..5504d12a8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -95,7 +95,7 @@ pub(super) fn quote_get_ptx( static #private_func_params: #cpu_func_lifetime_erased_types; )* } - #crate_path::kernel::link_kernel!{ + #crate_path::kernel::compile_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs similarity index 96% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs index 36479b62a..ea5daccdc 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -8,10 +8,10 @@ mod get_ptx; use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME -pub(in super::super) fn quote_host_linker_macro( +pub(in super::super) fn quote_host_link_macro( crate_path: &syn::Path, 
KernelConfig { - visibility, linker, .. + visibility, link, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, @@ -86,7 +86,7 @@ pub(in super::super) fn quote_host_linker_macro( quote! { #[cfg(not(target_os = "cuda"))] - #visibility macro #linker( + #visibility macro #link( impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs index bf2c293cc..829cb0433 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,4 @@ pub mod cuda_generic_function; pub mod cuda_wrapper; pub mod host_kernel_ty; -pub mod host_linker_macro; +pub mod host_link_macro; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index f400e3147..0c4f743ab 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -14,7 +14,7 @@ use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, - host_kernel_ty::quote_host_kernel_ty, host_linker_macro::quote_host_linker_macro, + host_kernel_ty::quote_host_kernel_ty, host_link_macro::quote_host_link_macro, }; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; @@ -33,7 +33,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! for impl)] expects LINKER identifier: {:?}", + "#[kernel(pub? use LINK! 
for impl)] expects LINK macro identifier: {:?}", err ) }, @@ -107,9 +107,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow); let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); - let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUse, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); - let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DumpAssembly, LintLevel::Allow); let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn); let ptx_lint_levels = { @@ -223,7 +223,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, ); let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); - let host_linker_macro = quote_host_linker_macro( + let host_link_macro = quote_host_link_macro( &crate_path, &config, &decl_generics, @@ -255,7 +255,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { #host_generic_kernel_check - #host_linker_macro + #host_link_macro #cuda_wrapper #cuda_generic_function diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 86d4137bb..29ac48f12 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -19,6 +19,10 @@ //! //! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod //! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-kernel` provides the [`#[kernel]`](macro@kernel) attribute +//! macro. When applied to a function, it compiles it as a CUDA kernel that +//! can be *safely* called from Rust code on the host. 
#![deny(clippy::complexity)] #![deny(clippy::correctness)] @@ -28,7 +32,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![deny(unsafe_code)] -// #![warn(missing_docs)] // FIXME +#![warn(missing_docs)] #![feature(box_patterns)] #![feature(proc_macro_tracked_env)] #![feature(proc_macro_span)] @@ -51,6 +55,109 @@ mod kernel; #[proc_macro_error] #[proc_macro_attribute] +/// Provides the [`#[kernel]`](macro@kernel) attribute macro. When applied to a +/// function, it compiles it as a CUDA kernel that can be *safely* called from +/// Rust code on the host. +/// +/// The annotated function must be public, not const, not async, not have an +/// explicit ABI, not be variadic, not have a receiver (e.g. `&self`), and +/// return the unit type `()`. At the moment, the kernel function must also +/// not use a where clause – use type generic bounds instead. +/// +/// While the [`#[kernel]`](macro@kernel) attribute supports functions with any +/// number of arguments, [`rust_cuda::kernel::TypedPtxKernel`] only supports +/// launching kernels with up to 12 parameters at the moment. +/// +/// The [`#[kernel]`](macro@kernel) attribute uses the following syntax: +/// +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel(/* parameters */) { +/// /* kernel code */ +/// } +/// ``` +/// +/// where `link` is the name of a macro that will be generated to manually link +/// specific monomorphised instantiations of the (optionally generic) kernel +/// function, and the optional `pub` controls whether this macro is public or +/// private. +/// +/// Note that all kernel parameters must implement the sealed +/// [`rust_cuda::kernel::CudaKernelParameter`] trait. +/// +/// To use a specific monomorphised instantiation of the kernel, the generated +/// `link!` macro must be invoked with the following syntax: +/// +/// ```rust,ignore +/// struct KernelPtx; +/// link! 
{ impl my_kernel for KernelPtx } +/// ``` +/// for the non-generic kernel function `my_kernel` and a non-generic marker +/// type `KernelPtx`, which can be used as the generic `Kernel` type parameter +/// for [`rust_cuda::kernel::TypedPtxKernel`] to instantiate and launch the +/// kernel. Specifically, the [`rust_cuda::kernel::CompiledKernelPtx`] trait is +/// implemented for the `KernelPtx` type. +/// +/// If the kernel function is generic, the following syntax is used instead: +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel<'a, A, B: Bounded, const N: usize>(/* parameters */) { +/// /* kernel code */ +/// } +/// +/// struct KernelPtx<'a, A, B: Bounded, const N: usize>(/* ... */); +/// link! { impl my_kernel<'a, u32, MyStruct, 42> for KernelPtx } +/// link! { impl my_kernel<'a, bool, MyOtherStruct, 24> for KernelPtx } +/// ``` +/// +/// If the kernel generic space is closed, the `link!` macro can be made +/// private and all instantiations must be requested in the same crate that +/// defines the kernel function. If downstream code should be allowed to use +/// and compile new specific monomorphised instantiations of the kernel, the +/// `link!` macro should be publicly exported. Then, downstream code can define +/// its own `MyKernelPtx` marker types for which the kernel is linked and which +/// can be passed to [`rust_cuda::kernel::CompiledKernelPtx`]-generic code in +/// the kernel-defining crate to construct the requested +/// [`rust_cuda::kernel::TypedPtxKernel`]. +/// +/// Inside the scope of the [`#[kernel]`](macro@kernel) attribute, a helper +/// `#[kernel(...)]` attribute can be applied to the kernel function: +/// +/// - `#[kernel(crate = "")]` changes the path to the [`rust-cuda`] +/// crate that the kernel compilation uses, which by default is `rust_cuda`. 
+/// - `#[kernel(allow/warn/deny/forbid())]` checks the specified +/// CUDA-specific lint for each kernel compilation, using default Rust +/// semantics for allowing, warning on, denying, or forbidding a lint. The +/// following lints are supported: +/// - `ptx::double_precision_use`: check for any uses of [`f64`] operations +/// inside the compiled PTX binary, as they are often significantly less +/// performant on NVIDIA GPUs than [`f32`] operations. By default, +/// `#[kernel(warn(ptx::double_precision_use))]` is set. +/// - `ptx::local_memory_use`: check for any usage of local memory, which may +/// slow down kernel execution. By default, +/// `#[kernel(warn(ptx::local_memory_use))]` is set. +/// - `ptx::register_spills`: check for any spills of registers to local +/// memory. While using less registers can allow more kernels to be run in +/// parallel, register spills may also point to missed optimisations. By +/// default, `#[kernel(warn(ptx::register_spills))]` is set. +/// - `ptx::dynamic_stack_size`: check if the PTX compiler is unable to +/// statically determine the size of the required kernel function stack. +/// When the static stack size is known, the compiler may be able to keep it +/// entirely within the fast register file. However, when the stack size is +/// dynamic, more costly memory load and store operations are needed. By +/// default, `#[kernel(warn(ptx::dynamic_stack_size))]` is set. +/// - `ptx::verbose`: utility lint to output verbose PTX compiler messages as +/// warnings (`warn`) or errors (`deny` or `forbid`) or to not output them +/// (`allow`). By default, `#[kernel(allow(ptx::verbose))]` is set. +/// - `ptx::dump_assembly`: utility lint to output the compiled PTX assembly +/// code as a warning (`warn`) or an error (`deny` or `forbid`) or to not +/// output it (`allow`). By default, `#[kernel(allow(ptx::dump_assembly))]` +/// is set. 
+/// +/// [`rust_cuda::kernel::TypedPtxKernel`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/struct.TypedPtxKernel.html +/// [`rust_cuda::kernel::CudaKernelParameter`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CudaKernelParameter.html +/// [`rust_cuda::kernel::CompiledKernelPtx`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CompiledKernelPtx.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { kernel::wrapper::kernel(attr, func) } @@ -58,13 +165,17 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) +/// Helper macro to specialise the generic kernel param types when compiling +/// the specialised kernel for CUDA. +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + kernel::specialise::param_type::specialise_kernel_param_type(tokens) } #[doc(hidden)] #[proc_macro_error] #[proc_macro] +/// Helper macro to specialise the CUDA kernel entry point name, used on the +/// host for linking to it. pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) } @@ -72,6 +183,8 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] +/// Helper macro to specialise the name of the CUDA kernel function item, used +/// to give each specialised version a unique ident when compiling for CUDA. 
pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { kernel::specialise::function::specialise_kernel_function(attr, func) } @@ -79,6 +192,8 @@ pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> Token #[doc(hidden)] #[proc_macro_error] #[proc_macro] +/// Helper macro to cheaply check the generic CUDA kernel, used on the host to +/// provide code error feedback even when no specialised kernel is linked. pub fn check_kernel(tokens: TokenStream) -> TokenStream { kernel::link::check_kernel(tokens) } @@ -86,6 +201,8 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) +/// Helper macro to compile a specialised CUDA kernel and produce its PTX +/// assembly code, which is used on the host when linking specialised kernels. +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::compile_kernel(tokens) } diff --git a/src/device/mod.rs b/src/device/mod.rs index 791035d51..df20ae5a8 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,6 +1,6 @@ #[doc(hidden)] #[cfg(feature = "kernel")] -pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_type}; +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_param_type}; pub mod alloc; pub mod thread; diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index e69b0b15a..3fc2b2e60 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -19,7 +19,7 @@ pub use rust_cuda_kernel::kernel; #[doc(hidden)] #[cfg(all(feature = "kernel", feature = "host"))] #[allow(clippy::module_name_repetitions)] -pub use rust_cuda_kernel::{check_kernel, link_kernel, specialise_kernel_entry_point}; +pub use rust_cuda_kernel::{check_kernel, compile_kernel, specialise_kernel_entry_point}; #[cfg(feature = "host")] mod ptx_jit; From d11e6d9fa9a242e15c9481376254aa245efb216a Mon Sep 17 00:00:00 
2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 12:03:33 +0000 Subject: [PATCH 113/120] Bump MSRV to 1.77-nightly --- Cargo.toml | 2 +- examples/print/src/main.rs | 1 - examples/single-source/src/main.rs | 1 - rust-cuda-derive/Cargo.toml | 8 +------- rust-cuda-kernel/Cargo.toml | 1 + rust-cuda-kernel/src/lib.rs | 1 - rust-toolchain | 1 - src/lib.rs | 2 -- 8 files changed, 3 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5aaa324bb..3b6dbf342 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" -rust-version = "1.75" # nightly +rust-version = "1.77" # nightly # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 5aec2b391..c99ae0df9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -6,7 +6,6 @@ #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(decl_macro)] diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index f0aa25ce7..b4a7cec52 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -7,7 +7,6 @@ #![feature(const_type_name)] #![feature(offset_of)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] #![feature(decl_macro)] diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 73a74907b..fc214dea7 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly # See more keys 
and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -15,10 +16,3 @@ syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -# regex = "1.5" -# lazy_static = "1.4" -# serde_json = "1.0" -# cargo_metadata = { version = "0.18", features = ["builder"] } -# strip-ansi-escapes = "0.2" -# colored = "2.0" -# thiserror = "1.0" diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml index 23e641841..b944bf875 100644 --- a/rust-cuda-kernel/Cargo.toml +++ b/rust-cuda-kernel/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 29ac48f12..e6d5cf3ac 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -41,7 +41,6 @@ #![feature(proc_macro_def_site)] #![feature(proc_macro_c_str_literals)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; diff --git a/rust-toolchain b/rust-toolchain index e6cfef665..7734bcf14 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,4 @@ [toolchain] -# Pin to final 1.75.0 nightly channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/lib.rs b/src/lib.rs index df8c42290..1c92688b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,8 +54,6 @@ #![feature(never_type)] #![feature(layout_for_ptr)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] -#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] 
#![cfg_attr(feature = "device", feature(slice_ptr_get))] From 521419c50b734a55d8ccccdffc45674d681d20d1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 17:06:24 +0000 Subject: [PATCH 114/120] Try trait-based kernel signature check --- .../generate/host_link_macro/get_ptx.rs | 13 ++-- src/safety/ptx_kernel_signature.rs | 59 +++++-------------- 2 files changed, 23 insertions(+), 49 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 5504d12a8..39b859a77 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -68,11 +68,14 @@ pub(super) fn quote_get_ptx( let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! { func_ident.span()=> - const _: #crate_path::safety::ptx_kernel_signature::Assert<{ - #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) - }>; + const _: () = #crate_path::safety::ptx_kernel_signature::check::< + { + &#crate_path::deps::const_type_layout::serialise_type_graph::< + #ffi_signature_ty + >() + }, + #ffi_signature_ident, + >(); } }; diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index eb4a63820..aa42b905e 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,51 +1,22 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; +const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; -#[allow(clippy::module_name_repetitions)] -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum HostAndDeviceKernelSignatureTypeLayout { - 
Match, - Mismatch, +#[marker] +pub trait SameHostAndDeviceKernelSignatureTypeLayout +{ } -pub struct Assert; - -#[must_use] -pub const fn check( - device: &'static [u8], -) -> HostAndDeviceKernelSignatureTypeLayout -where - [u8; serialised_type_graph_len::()]:, +impl SameHostAndDeviceKernelSignatureTypeLayout for () {} +impl SameHostAndDeviceKernelSignatureTypeLayout + for () +{ +} +impl SameHostAndDeviceKernelSignatureTypeLayout + for () { - const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; - - // Short-circuit to avoid extra errors when PTX compilation fails - if equals(device, SIGNATURE_ERROR_MESSAGE) { - return HostAndDeviceKernelSignatureTypeLayout::Match; - } - - let host = serialise_type_graph::(); - - if equals(device, &host) { - HostAndDeviceKernelSignatureTypeLayout::Match - } else { - HostAndDeviceKernelSignatureTypeLayout::Mismatch - } } -const fn equals(device: &[u8], host: &[u8]) -> bool { - if host.len() != device.len() { - return false; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != device[i] { - return false; - } - - i += 1; - } - - true +pub const fn check() +where + (): SameHostAndDeviceKernelSignatureTypeLayout, +{ } From 07dc90875a7a535006c9abae3c8e903a652814b3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 19:39:29 +0000 Subject: [PATCH 115/120] Try naming host kernel layout const --- .../wrapper/generate/host_link_macro/get_ptx.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 39b859a77..51ecfb0da 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -65,15 +65,18 @@ pub(super) fn quote_get_ptx( quote!() } else { let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, 
func_ident.span()); + let ffi_signature_host_ident = quote::format_ident!("{ffi_signature_ident}_HOST"); let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! { func_ident.span()=> + #[allow(dead_code)] + const #ffi_signature_host_ident: &'static [u8] = + &#crate_path::deps::const_type_layout::serialise_type_graph::< + #ffi_signature_ty + >(); + const _: () = #crate_path::safety::ptx_kernel_signature::check::< - { - &#crate_path::deps::const_type_layout::serialise_type_graph::< - #ffi_signature_ty - >() - }, + #ffi_signature_host_ident, #ffi_signature_ident, >(); } From 1c8115c219c4d48dfcaafd9dd6889001a0742277 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 21 Jan 2024 07:16:04 +0000 Subject: [PATCH 116/120] Try match against byte literal for faster comparison --- rust-cuda-kernel/src/kernel/link/mod.rs | 15 ++++++++--- .../generate/host_link_macro/get_ptx.rs | 19 +++++++------- src/safety/ptx_kernel_signature.rs | 25 ++++--------------- 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 10d3b63ed..a0df9366a 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -78,7 +78,9 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + } ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -116,7 +118,9 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + } ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -199,7 +203,12 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + match host { + #byte_str => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match, + _ => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Mismatch, + } + } }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 51ecfb0da..a54a62b18 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -65,20 +65,16 @@ pub(super) fn quote_get_ptx( quote!() } else { let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); - let ffi_signature_host_ident = quote::format_ident!("{ffi_signature_ident}_HOST"); let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! 
{ func_ident.span()=> - #[allow(dead_code)] - const #ffi_signature_host_ident: &'static [u8] = - &#crate_path::deps::const_type_layout::serialise_type_graph::< + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident(&#crate_path::deps::const_type_layout::serialise_type_graph::< #ffi_signature_ty - >(); - - const _: () = #crate_path::safety::ptx_kernel_signature::check::< - #ffi_signature_host_ident, - #ffi_signature_ident, - >(); + >()) + }>; } }; @@ -93,6 +89,9 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + #[allow(dead_code)] + use #crate_path as rust_cuda_import; + #args_trait extern "C" { #( diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index aa42b905e..5b33567bb 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,22 +1,7 @@ -const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; - -#[marker] -pub trait SameHostAndDeviceKernelSignatureTypeLayout -{ +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, } -impl SameHostAndDeviceKernelSignatureTypeLayout for () {} -impl SameHostAndDeviceKernelSignatureTypeLayout - for () -{ -} -impl SameHostAndDeviceKernelSignatureTypeLayout - for () -{ -} - -pub const fn check() -where - (): SameHostAndDeviceKernelSignatureTypeLayout, -{ -} +pub struct Assert; From b040cac08c51ffe3903d0e900eb1ed7ba59a38d0 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 21 Jan 2024 08:00:41 +0000 Subject: [PATCH 117/120] Try with memcmp intrinsic --- rust-cuda-kernel/src/kernel/link/mod.rs | 15 ++------ .../generate/host_link_macro/get_ptx.rs | 7 +--- src/lib.rs | 5 ++- src/safety/ptx_kernel_signature.rs | 34 +++++++++++++++++++ 4 files changed, 42 
insertions(+), 19 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index a0df9366a..10d3b63ed 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -78,9 +78,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - } + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -118,9 +116,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - } + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -203,12 +199,7 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - match host { - #byte_str => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match, - _ => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Mismatch, - } - } + const #param: &[u8; #len] = #byte_str; }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git 
a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index a54a62b18..5504d12a8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,9 +71,7 @@ pub(super) fn quote_get_ptx( const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #ffi_signature_ident(&#crate_path::deps::const_type_layout::serialise_type_graph::< - #ffi_signature_ty - >()) + #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) }>; } }; @@ -89,9 +87,6 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { - #[allow(dead_code)] - use #crate_path as rust_cuda_import; - #args_trait extern "C" { #( diff --git a/src/lib.rs b/src/lib.rs index 1c92688b1..a6d41b648 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,9 +54,12 @@ #![feature(never_type)] #![feature(layout_for_ptr)] #![feature(cfg_version)] +#![cfg_attr(feature = "device", feature(slice_ptr_get))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] -#![cfg_attr(feature = "device", feature(slice_ptr_get))] +#![allow(internal_features)] +#![feature(core_intrinsics)] +#![feature(const_intrinsic_compare_bytes)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] #[cfg(all(feature = "host", feature = "device", not(doc)))] diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index 5b33567bb..a8b298691 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,3 +1,6 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] #[derive(PartialEq, Eq, 
core::marker::ConstParamTy)] pub enum HostAndDeviceKernelSignatureTypeLayout { Match, @@ -5,3 +8,34 @@ pub enum HostAndDeviceKernelSignatureTypeLayout { } pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if device.len() != host.len() { + return false; + } + + unsafe { core::intrinsics::compare_bytes(device.as_ptr(), host.as_ptr(), device.len()) == 0 } +} From 44a974bd2b0191193b635c4254995ed7b12ea9f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 1 Feb 2024 08:55:18 +0000 Subject: [PATCH 118/120] Try out experimental const-type-layout with compression --- Cargo.toml | 2 +- examples/derive/src/lib.rs | 1 - examples/single-source/src/main.rs | 1 - rust-cuda-kernel/src/kernel/link/config.rs | 2 ++ rust-cuda-kernel/src/kernel/link/mod.rs | 21 +++++++++++++++++++ .../src/kernel/specialise/entry_point.rs | 1 + .../src/kernel/specialise/function.rs | 1 + .../wrapper/generate/cuda_generic_function.rs | 1 + .../kernel/wrapper/generate/cuda_wrapper.rs | 1 + .../kernel/wrapper/generate/host_kernel_ty.rs | 1 + .../generate/host_link_macro/args_trait.rs | 1 + .../generate/host_link_macro/get_ptx.rs | 1 + .../wrapper/generate/host_link_macro/mod.rs | 1 + rust-cuda-kernel/src/kernel/wrapper/mod.rs | 1 + src/lib.rs | 1 - src/safety/ptx_entry_point.rs | 17 +++------------ 16 files changed, 36 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml 
index 3b6dbf342..acb60681a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc regex = { version = "1.10", optional = true } -const-type-layout = { version = "0.2.1", features = ["derive"] } +const-type-layout = { git = "https://github.com/juntyr/const-type-layout", branch = "compress", features = ["derive"] } safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 622b1b699..6960eadeb 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -1,6 +1,5 @@ #![deny(clippy::pedantic)] #![feature(const_type_name)] -#![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b4a7cec52..3861190d2 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -5,7 +5,6 @@ #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] -#![feature(offset_of)] #![feature(cfg_version)] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] diff --git a/rust-cuda-kernel/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs index 469318f02..02297ba7d 100644 --- a/rust-cuda-kernel/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,5 +1,7 @@ use std::{collections::HashMap, path::PathBuf}; +use quote::quote; + use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 10d3b63ed..001aad0ef 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ 
b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -18,6 +18,7 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; +use quote::quote; use crate::kernel::{ lints::{LintLevel, PtxLint}, @@ -196,6 +197,26 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec TokenStream { diff --git a/rust-cuda-kernel/src/kernel/specialise/function.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs index 068f30d97..44d8b8a81 100644 --- a/rust-cuda-kernel/src/kernel/specialise/function.rs +++ b/rust-cuda-kernel/src/kernel/specialise/function.rs @@ -1,6 +1,7 @@ use std::env::VarError; use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 00e00d7d8..0799f4cc7 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index c3cb11458..ff7e2ee48 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::{ wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs index 78e972d69..757f22470 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs index 26653e435..1813942d8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 5504d12a8..4d5e01a25 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::{ utils::skip_kernel_compilation, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs index ea5daccdc..353e6c5dc 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index 0c4f743ab..9dffacc51 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ 
-19,6 +19,7 @@ use generate::{ use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; +use quote::quote; #[allow(clippy::too_many_lines)] pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { diff --git a/src/lib.rs b/src/lib.rs index a6d41b648..35e11ed1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,6 @@ #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] -#![feature(offset_of)] #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] diff --git a/src/safety/ptx_entry_point.rs b/src/safety/ptx_entry_point.rs index b1d62cf4e..ab06a13d9 100644 --- a/src/safety/ptx_entry_point.rs +++ b/src/safety/ptx_entry_point.rs @@ -55,19 +55,8 @@ const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { } const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { - let mut i = 0; - - while i < needle.len() { - if (from + i) >= haystack.len() { - return false; - } - - if needle[i] == haystack[from + i] { - i += 1; - } else { - return false; - } - } + let haystack_len = haystack.len() - from; + let check_len = if needle.len() < haystack_len { needle.len() } else { haystack_len }; - true + unsafe { core::intrinsics::compare_bytes(haystack.as_ptr().add(from), needle.as_ptr(), check_len) == 0 } } From 3ec8118114eabbb1b3048af248d0439e4d250a37 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 5 Feb 2024 06:55:09 +0000 Subject: [PATCH 119/120] Try check --- rust-cuda-kernel/src/kernel/link/mod.rs | 22 +++++++++++++++---- .../generate/host_link_macro/get_ptx.rs | 11 +++++++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 001aad0ef..0b68debc7 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -79,7 +79,11 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { 
proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + HostAndDeviceKernelSignatureTypeLayout::Match + } + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -117,7 +121,11 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + HostAndDeviceKernelSignatureTypeLayout::Match + } + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -217,10 +225,16 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec() -> HostAndDeviceKernelSignatureTypeLayout { + if check_serialised_type_graph::(#byte_str) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } + } }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 4d5e01a25..08ec8ab40 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,9 +71,14 @@ pub(super) fn quote_get_ptx( quote::quote_spanned! 
{ func_ident.span()=> const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) - }>; + }> = { + use #crate_path::deps::const_type_layout::{TypeLayoutGraph, check_serialised_type_graph}; + use crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + + #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }> + }; } }; From 6311a6d40e91e817f0474447f5312129d2ca9581 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 5 Feb 2024 09:13:16 +0000 Subject: [PATCH 120/120] Try check again --- rust-cuda-kernel/src/kernel/link/mod.rs | 6 +++--- .../wrapper/generate/host_link_macro/get_ptx.rs | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 0b68debc7..bbe243c9f 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -80,7 +80,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { HostAndDeviceKernelSignatureTypeLayout::Match } @@ -122,7 +122,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { return (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { HostAndDeviceKernelSignatureTypeLayout::Match } @@ -228,7 +228,7 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #param() -> HostAndDeviceKernelSignatureTypeLayout { if check_serialised_type_graph::(#byte_str) { HostAndDeviceKernelSignatureTypeLayout::Match } else { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 08ec8ab40..ef65d5596 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,14 +71,9 @@ pub(super) fn quote_get_ptx( quote::quote_spanned! { func_ident.span()=> const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = { - use #crate_path::deps::const_type_layout::{TypeLayoutGraph, check_serialised_type_graph}; - use crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; - - #crate_path::safety::ptx_kernel_signature::Assert::<{ - #ffi_signature_ident::<#ffi_signature_ty>() - }> - }; + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }>; } }; @@ -93,6 +88,10 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + // FIXME: don't use imports here + use #crate_path::deps::const_type_layout::{TypeGraphLayout, check_serialised_type_graph}; + use #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + #args_trait extern "C" { #(