From 22569d9b511163e49f6822df0eac1b3fed69141d Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Fri, 18 Nov 2022 12:47:43 +0000 Subject: [PATCH 001/120] Initial work on supporting some async memory transfers --- Cargo.toml | 6 +- .../src/rust_to_cuda/field_copy.rs | 38 ++ rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 + rust-cuda-derive/src/rust_to_cuda/mod.rs | 7 + rust-cuda-ptx-jit/Cargo.toml | 2 +- src/common.rs | 62 +++- src/device/mod.rs | 12 +- src/host.rs | 351 ++++++++++++++++-- src/utils/aliasing/const.rs | 38 +- src/utils/aliasing/dynamic.rs | 39 +- src/utils/aliasing/final.rs | 38 +- src/utils/device_copy.rs | 48 ++- src/utils/exchange/buffer/common.rs | 4 +- src/utils/exchange/buffer/device.rs | 11 +- src/utils/exchange/buffer/host.rs | 65 +++- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 213 ++++++++++- src/utils/option.rs | 83 ++++- 18 files changed, 942 insertions(+), 81 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e8c86665b..17a279023 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = "0.1.2" +rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52" } -rustacuda = { version = "0.1.3", optional = true } -rustacuda_derive = { version = "0.1.2", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 0ddca9b28..93326aab6 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -13,8 +13,10 @@ pub fn 
impl_field_copy_init_and_expand_alloc_type( mut combined_cuda_alloc_type: TokenStream, r2c_field_declarations: &mut Vec, + r2c_field_async_declarations: &mut Vec, r2c_field_initialisations: &mut Vec, r2c_field_destructors: &mut Vec, + r2c_field_async_destructors: &mut Vec, c2r_field_initialisations: &mut Vec, ) -> TokenStream { @@ -35,6 +37,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( &self.#field_accessor, ); }); + r2c_field_async_declarations.push(quote! { + let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + &self.#field_accessor, + ); + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -60,6 +67,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + &self.#field_accessor, + alloc_front, + stream, + )?; + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -71,6 +85,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_destructors.push(quote! { + let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + &mut self.#field_accessor, + alloc_front, + stream, + )?; + }); c2r_field_initialisations.push(quote! { #optional_field_ident { @@ -94,6 +115,15 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_declarations.push(quote! { + let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + < + #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + >::from_ref(&self.#field_accessor), + alloc_front, + stream, + )?; + }); r2c_field_initialisations.push(quote! { #optional_field_ident #field_repr_ident, @@ -107,6 +137,14 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, )?; }); + r2c_field_async_destructors.push(quote! 
{ + let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + < + #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + >::from_mut(&mut self.#field_accessor), + alloc_front, + )?; + }); c2r_field_initialisations.push(quote! { #optional_field_ident { diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 8b99e4f73..2c6593068 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -42,6 +42,8 @@ pub fn cuda_struct_declaration( } } +// TODO: derive async impl as well -> need different trait bounds + #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( struct_name: &syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 18589b78a..00e756c00 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -25,8 +25,10 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { rust_cuda::host::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); + let mut r2c_field_async_declarations: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); let mut r2c_field_destructors: Vec = Vec::new(); + let mut r2c_field_async_destructors: Vec = Vec::new(); let mut c2r_field_initialisations: Vec = Vec::new(); @@ -40,6 +42,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { .. 
}) => { let mut r2c_field_destructors_reverse: Vec = Vec::new(); + let mut r2c_field_async_destructors_reverse: Vec = Vec::new(); for (field_index, field) in fields.iter_mut().enumerate() { let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field); @@ -50,14 +53,18 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &cuda_repr_field_ty, combined_cuda_alloc_type, &mut r2c_field_declarations, + &mut r2c_field_async_declarations, &mut r2c_field_initialisations, &mut r2c_field_destructors_reverse, + &mut r2c_field_async_destructors_reverse, &mut c2r_field_initialisations, ); } // The fields must be deallocated in the reverse order of their allocation r2c_field_destructors.extend(r2c_field_destructors_reverse.into_iter().rev()); + r2c_field_async_destructors + .extend(r2c_field_async_destructors_reverse.into_iter().rev()); }, syn::Fields::Unit => (), } diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index f2a4cd09a..d5b832eb8 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { version = "0.1.3", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/common.rs b/src/common.rs index b2d398e09..abb196c05 100644 --- a/src/common.rs +++ b/src/common.rs @@ -88,12 +88,13 @@ pub unsafe trait RustToCuda { #[doc(cfg(feature = "host"))] /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA /// /// # Safety /// /// This is an internal function and should NEVER be called manually - /// The returned `Self::CudaRepresentation` must NEVER be accessed on the + /// The returned 
[`Self::CudaRepresentation`] must NEVER be accessed on the /// CPU as it contains a GPU-resident copy of `self`. #[allow(clippy::type_complexity)] unsafe fn borrow( @@ -108,7 +109,8 @@ pub unsafe trait RustToCuda { #[doc(cfg(feature = "host"))] /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA /// /// # Safety /// @@ -120,6 +122,53 @@ pub unsafe trait RustToCuda { ) -> rustacuda::error::CudaResult; } +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCudaAsync: RustToCuda { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned + /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER + /// be accessed on the CPU as it contains a GPU-resident copy of + /// `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )>; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult; +} + /// # Safety /// /// This is an internal trait and should NEVER be implemented manually @@ -141,6 +190,13 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } +pub trait RustToCudaAsyncProxy: RustToCudaAsync { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + #[repr(transparent)] #[derive(Clone, Copy, TypeLayout)] pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { diff --git a/src/device/mod.rs b/src/device/mod.rs index 225bc8252..39ae0719f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -18,8 +18,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceConstRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda`. + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). unsafe fn with_borrow_from_rust) -> O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, @@ -28,8 +28,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr_mut` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::lend_to_cuda_mut`. 
+ /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). /// Furthermore, since different GPU threads can access heap storage /// mutably inside the safe `inner` scope, there must not be any /// aliasing between concurrently running threads. @@ -41,8 +41,8 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// `DeviceMutRef` borrowed on the CPU using the corresponding - /// `LendToCuda::move_to_cuda`. + /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). unsafe fn with_moved_from_rust O>( cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, diff --git a/src/host.rs b/src/host.rs index 6c91a26bc..3c19ac6fd 100644 --- a/src/host.rs +++ b/src/host.rs @@ -7,8 +7,9 @@ use core::{ use rustacuda::{ context::Context, error::{CudaError, CudaResult}, + event::Event, function::Function, - memory::{DeviceBox, DeviceBuffer, LockedBuffer}, + memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; @@ -32,7 +33,7 @@ pub trait Launcher { /// # Errors /// - /// Should only return a `CudaError` if some implementation-defined + /// Should only return a [`CudaError`] if some implementation-defined /// critical kernel function configuration failed. #[allow(unused_variables)] fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { @@ -72,7 +73,7 @@ pub struct TypedKernel { impl TypedKernel { /// # Errors /// - /// Returns a `CudaError` if `ptx` or `entry_point` contain nul bytes. + /// Returns a [`CudaError`] if `ptx` or `entry_point` contain nul bytes. 
pub fn new(ptx: &str, entry_point: &str) -> CudaResult { let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; @@ -92,7 +93,7 @@ impl TypedKernel { /// # Errors /// - /// Returns a `CudaError` if `ptx` (from [`Self::new`]) is not a valid + /// Returns a [`CudaError`] if `ptx` (from [`Self::new`]) is not a valid /// PTX source, or it does not contain an entry point named `entry_point` /// (from [`Self::new`]). pub fn compile_with_ptx_jit_args( @@ -122,12 +123,12 @@ impl TypedKernel { pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the - /// `DeviceConstRef` inside the closure + /// [`DeviceConstRef`] inside the closure /// - after the closure, `&self` will not have changed /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error occurs inside CUDA fn lend_to_cuda< O, E: From, @@ -141,7 +142,7 @@ pub trait LendToCuda: RustToCuda { /// Lends a mutable copy of `&mut self` to CUDA: /// - code in the CUDA kernel can only access `&mut self` through the - /// `DeviceMutRef` inside the closure + /// [`DeviceMutRef`] inside the closure /// - after the closure, `&mut self` might have changed in the following /// ways: /// - to avoid aliasing, each CUDA thread gets its own shallow copy of @@ -152,7 +153,7 @@ pub trait LendToCuda: RustToCuda { /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error occurs inside CUDA fn lend_to_cuda_mut< O, E: From, @@ -164,11 +165,11 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` is `SafeDeviceCopy` + /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] /// /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`CudaError`] iff an error 
occurs inside CUDA fn move_to_cuda< O, E: From, @@ -338,6 +339,7 @@ macro_rules! impl_sealed_drop_collection { impl_sealed_drop_collection!(DeviceBuffer); impl_sealed_drop_collection!(DeviceBox); impl_sealed_drop_collection!(LockedBuffer); +impl_sealed_drop_collection!(LockedBox); macro_rules! impl_sealed_drop_value { ($type:ident) => { @@ -352,6 +354,64 @@ macro_rules! impl_sealed_drop_value { impl_sealed_drop_value!(Module); impl_sealed_drop_value!(Stream); impl_sealed_drop_value!(Context); +impl_sealed_drop_value!(Event); + +#[repr(transparent)] +#[allow(clippy::module_name_repetitions)] +pub struct HostLockedBox(*mut T); + +impl HostLockedBox { + /// # Errors + /// Returns a [`CudaError`] iff an error occurs inside CUDA + pub fn new(value: T) -> CudaResult { + // Safety: uninitialised memory is immediately written to without reading it + let locked_ptr = unsafe { + let locked_ptr: *mut T = LockedBox::into_raw(LockedBox::uninitialized()?); + locked_ptr.write(value); + locked_ptr + }; + + Ok(Self(locked_ptr)) + } +} + +impl Deref for HostLockedBox { + type Target = T; + + fn deref(&self) -> &Self::Target { + unsafe { &*self.0 } + } +} + +impl DerefMut for HostLockedBox { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.0 } + } +} + +impl From> for HostLockedBox { + fn from(locked_box: LockedBox) -> Self { + Self(LockedBox::into_raw(locked_box)) + } +} + +impl From> for LockedBox { + fn from(host_locked_box: HostLockedBox) -> Self { + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + unsafe { LockedBox::from_raw(host_locked_box.0) } + } +} + +impl Drop for HostLockedBox { + fn drop(&mut self) { + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. 
this function completes the roundtrip + let locked_box = unsafe { LockedBox::from_raw(self.0) }; + + core::mem::drop(CudaDropWrapper::from(locked_box)); + } +} #[repr(transparent)] #[allow(clippy::module_name_repetitions)] @@ -362,9 +422,9 @@ impl private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors /// - /// Returns a `CudaError` iff copying from `value` into `self` failed. + /// Returns a [`CudaError`] iff copying from `value` into `self` failed. pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; @@ -373,14 +433,73 @@ impl HostDeviceBox { /// # Errors /// - /// Returns a `CudaError` iff copying from `self` into `value` failed. + /// Returns a [`CudaError`] iff copying from `self` into `value` failed. pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; rustacuda::memory::CopyDestination::copy_to(&*device_box, value) } + + /// # Errors + /// + /// Returns a [`CudaError`] iff copying from `value` into `self` failed. + /// + /// # Safety + /// + /// To use the data inside the device box, either + /// - the passed-in [`Stream`] must be synchronised + /// - the kernel must be launched on the passed-in [`Stream`] + pub unsafe fn async_copy_from( + &mut self, + value: &HostLockedBox, + stream: &Stream, + ) -> CudaResult<()> { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. 
this function completes the roundtrip + let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + let locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; + + unsafe { + rustacuda::memory::AsyncCopyDestination::async_copy_from( + &mut *device_box, + &*locked_box, + stream, + ) + } + } + + /// # Errors + /// + /// Returns a [`CudaError`] iff copying from `self` into `value` failed. + /// + /// # Safety + /// + /// To use the data inside `value`, the passed-in [`Stream`] must be + /// synchronised. + pub unsafe fn async_copy_to( + &self, + value: &mut HostLockedBox, + stream: &Stream, + ) -> CudaResult<()> { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. this function completes the roundtrip + let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; + // Safety: pointer comes from [`LockedBox::into_raw`] + // i.e. this function completes the roundtrip + let mut locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; + + unsafe { + rustacuda::memory::AsyncCopyDestination::async_copy_to( + &*device_box, + &mut *locked_box, + stream, + ) + } + } } impl From> for HostDeviceBox { @@ -391,7 +510,7 @@ impl From> for HostDeviceBox { impl From> for DeviceBox { fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip unsafe { DeviceBox::from_device(host_device_box.0) } } @@ -399,7 +518,7 @@ impl From> for DeviceBox { impl Drop for HostDeviceBox { fn drop(&mut self) { - // Safety: pointer comes from `DeviceBox::into_device` + // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. 
this function completes the roundtrip let device_box = unsafe { DeviceBox::from_device(self.0) }; @@ -426,7 +545,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new< O, @@ -473,9 +592,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { where 'a: 'b, { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceConstRef::new(self.device_box, self.host_ref) } + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + } } #[must_use] @@ -483,9 +603,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { where 'a: 'b, { - // Safety: `device_box` contains EXACTLY the device copy of `host_ref` - // by construction of `HostAndDeviceMutRef` - unsafe { HostAndDeviceMutRef::new(self.device_box, self.host_ref) } + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } } } @@ -516,7 +637,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new< O, @@ -573,7 +694,187 @@ pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { /// # Errors /// - /// Returns a `rustacuda::errors::CudaError` iff `value` cannot be moved + /// Returns a [`CudaError`] iff `value` cannot be moved + /// to CUDA or an error occurs inside `inner`. 
+ pub fn with_new< + O, + E: From, + F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, + >( + mut value: T, + inner: F, + ) -> Result { + let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + + // Safety: `device_box` contains exactly the device copy of `value` + let result = inner(HostAndDeviceOwned { + device_box: &mut device_box, + host_val: &mut value, + }); + + core::mem::drop(device_box); + core::mem::drop(value); + + result + } + + #[must_use] + pub fn for_device(self) -> DeviceMutRef<'a, T> { + DeviceMutRef { + pointer: self.device_box.0.as_raw_mut(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host(&'a mut self) -> &'a T { + self.host_val + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_ref: &'a mut T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new( + device_box: &'a mut HostDeviceBox, + host_ref: &'a mut T, + stream: &'stream Stream, + ) -> Self { + let _ = stream; + + Self { + device_box, + host_ref, + stream: PhantomData::<&'stream Stream>, + } + } + + #[must_use] + /// # Safety + /// + /// The returned [`DeviceMutRef`] must only be used on the constructed-with + /// [`Stream`] + pub unsafe fn for_device_async<'b>(&'b mut self) -> DeviceMutRef<'a, T> + where + 'a: 'b, + { + DeviceMutRef { + pointer: self.device_box.0.as_raw_mut(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host<'b: 'a>(&'b self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: self.stream, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b 
mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: self.stream, + } + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: DeviceCopy> { + device_box: &'a HostDeviceBox, + host_ref: &'a T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: DeviceCopy> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { + fn clone(&self) -> Self { + *self + } +} + +impl<'stream, 'a, T: DeviceCopy> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} + +impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new( + device_box: &'a HostDeviceBox, + host_ref: &'a T, + stream: &'stream Stream, + ) -> Self { + let _ = stream; + + Self { + device_box, + host_ref, + stream: PhantomData::<&'stream Stream>, + } + } + + #[must_use] + /// # Safety + /// + /// The returned [`DeviceConstRef`] must only be used on the + /// constructed-with [`Stream`] + pub unsafe fn for_device_async<'b>(&'b self) -> DeviceConstRef<'a, T> + where + 'a: 'b, + { + DeviceConstRef { + pointer: self.device_box.0.as_raw(), + reference: PhantomData, + } + } + + #[must_use] + pub fn for_host(&'a self) -> &'a T { + self.host_ref + } + + #[must_use] + pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + *self + } +} + +#[allow(clippy::module_name_repetitions)] +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, + stream: PhantomData<&'stream Stream>, +} + +impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { + /// # Errors + /// + /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. 
pub fn with_new< O, diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 361151ac2..8f7f1ab98 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -6,7 +6,7 @@ use core::{ use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[repr(transparent)] #[derive(Clone, TypeLayout)] @@ -19,7 +19,8 @@ impl SplitSliceOverCudaThreadsConstStride { } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` +// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is +// [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsConstStride { @@ -190,6 +191,39 @@ unsafe impl RustToCuda } } +unsafe impl RustToCudaAsync + for SplitSliceOverCudaThreadsConstStride +{ + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.0.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsConstStride, STRIDE> { diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 8b0446e08..6cba2ff9c 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -6,7 +6,7 @@ use core::{ use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use 
crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[repr(C)] #[derive(Clone, TypeLayout)] @@ -22,7 +22,8 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -// Safety: If `T` is `DeviceCopy`, then the newtype struct also is `DeviceCopy` +// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is +// [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} #[cfg(all(not(feature = "host"), target_os = "cuda"))] @@ -167,6 +168,40 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } } +unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.inner.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for SplitSliceOverCudaThreadsDynamicStride> { diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index f8d96d5e2..5a3d1695c 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,6 +1,6 @@ use r#final::Final; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; #[doc(hidden)] #[repr(transparent)] @@ -8,7 +8,7 @@ use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; #[allow(clippy::module_name_repetitions)] pub struct 
FinalCudaRepresentation(DeviceAccessible); -// Safety: If `T` is `CudaAsRust`, then the newtype struct is `DeviceCopy` +// Safety: If [`T`] is [`CudaAsRust`], then the newtype struct is [`DeviceCopy`] unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} unsafe impl RustToCuda for Final { @@ -48,6 +48,40 @@ unsafe impl RustToCuda for Final { } } +unsafe impl RustToCudaAsync for Final { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; + + Ok(( + DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + // Safety: Final is a repr(transparent) newtype wrapper around T + let inner: &mut T = &mut *(self as *mut Self).cast(); + + inner.restore_async(alloc, stream) + } +} + unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 289ef9969..1ae0515f9 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,7 +3,7 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -30,42 +30,42 @@ impl SafeDeviceCopyWrapper { } pub fn from_ref(reference: &T) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(reference as *const T).cast() } } pub fn 
into_ref(&self) -> &T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(self as *const Self).cast() } } pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(reference as *mut T).cast() } } pub fn into_mut(&mut self) -> &mut T { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(self as *mut Self).cast() } } pub fn from_slice(slice: &[T]) -> &[Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } pub fn into_slice(slice: &[Self]) -> &[T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype around `T` + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } } @@ -100,6 +100,36 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } +unsafe impl RustToCudaAsync + for SafeDeviceCopyWrapper +{ + #[cfg(feature = "host")] + 
#[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + Ok((DeviceAccessible::from(&self.0), alloc)) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index a153da4d0..c5d1f9128 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -16,8 +16,8 @@ pub struct CudaExchangeBufferCudaRepresentation` is `DeviceCopy` -// iff `T` is `SafeDeviceCopy` +// Safety: [`CudaExchangeBufferCudaRepresentation`] is [`DeviceCopy`] +// iff [`T`] is [`SafeDeviceCopy`] unsafe impl DeviceCopy for CudaExchangeBufferCudaRepresentation { diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index d284e1193..1ecaf91d2 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,7 +2,10 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{common::RustToCuda, safety::SafeDeviceCopy}; +use crate::{ + common::{RustToCuda, RustToCudaAsync}, + safety::SafeDeviceCopy, +}; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; @@ -42,3 +45,9 @@ unsafe impl; } + +#[cfg(not(all(doc, feature = "host")))] +unsafe impl + RustToCudaAsync for CudaExchangeBufferDevice +{ +} diff --git a/src/utils/exchange/buffer/host.rs 
b/src/utils/exchange/buffer/host.rs index ad522629f..debe33059 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,7 +11,7 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda}, + common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc}, safety::SafeDeviceCopy, }; @@ -39,7 +39,8 @@ impl { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn new(elem: &T, capacity: usize) -> CudaResult { // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T let elem: &CudaExchangeItem = unsafe { &*(elem as *const T).cast() }; @@ -60,7 +61,8 @@ impl CudaExchangeBufferHost { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn from_vec(vec: Vec) -> CudaResult { let mut host_buffer_uninit = CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? 
}); @@ -155,3 +157,60 @@ unsafe impl + RustToCudaAsync for CudaExchangeBufferHost +{ + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + // Safety: device_buffer is inside an UnsafeCell + // borrow checks must be satisfied through LendToCuda + let device_buffer = &mut *self.device_buffer.get(); + + if M2D { + // Only move the buffer contents to the device if needed + + rustacuda::memory::AsyncCopyDestination::async_copy_from( + &mut ***device_buffer, + self.host_buffer.as_slice(), + stream, + )?; + } + + Ok(( + DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( + device_buffer.as_mut_ptr(), + device_buffer.len(), + )), + CombinedCudaAlloc::new(NullCudaAlloc, alloc), + )) + } + + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + + if M2H { + // Only move the buffer contents back to the host if needed + + rustacuda::memory::AsyncCopyDestination::async_copy_to( + &***self.device_buffer.get_mut(), + self.host_buffer.as_mut_slice(), + stream, + )?; + } + + Ok(alloc_tail) + } +} diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 3648f9d04..1a940faa0 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -20,8 +20,8 @@ use crate::safety::SafeDeviceCopy; #[derive(Clone, Copy, TypeLayout)] pub struct CudaExchangeItem(T); -// Safety: Transparent newtype wrapper around `SafeDeviceCopy` -// is `DeviceCopy` +// Safety: Transparent newtype wrapper around [`SafeDeviceCopy`] +// is [`DeviceCopy`] unsafe impl rustacuda_core::DeviceCopy for CudaExchangeItem { diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 26958f491..f22a6defe 100644 --- 
a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,12 +1,20 @@ -use core::ops::{Deref, DerefMut}; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{ + error::CudaResult, + event::{Event, EventFlags}, + memory::DeviceBox, + stream::Stream, +}; use crate::{ - common::{DeviceAccessible, RustToCuda}, + common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{ - CombinedCudaAlloc, EmptyCudaAlloc, HostAndDeviceConstRef, HostAndDeviceMutRef, - HostDeviceBox, NullCudaAlloc, + CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, + HostAndDeviceMutRef, HostDeviceBox, HostLockedBox, NullCudaAlloc, }, }; @@ -14,39 +22,143 @@ use crate::{ pub struct ExchangeWrapperOnHost> { value: T, device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + move_event: CudaDropWrapper, +} + +#[allow(clippy::module_name_repetitions)] +pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { + value: T, + device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + move_event: CudaDropWrapper, + stream: PhantomData<&'stream Stream>, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDevice> { value: T, device_box: HostDeviceBox::CudaRepresentation>>, - cuda_repr: DeviceAccessible<::CudaRepresentation>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + move_event: CudaDropWrapper, +} + +#[allow(clippy::module_name_repetitions)] +pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { + value: T, + device_box: HostDeviceBox::CudaRepresentation>>, + locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + move_event: CudaDropWrapper, + stream: PhantomData<&'stream Stream>, } impl> 
ExchangeWrapperOnHost { /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn new(value: T) -> CudaResult { + // Safety: The uninitialised memory is never exposed + // To access the device memory, [`Self::move_to_device`] has to be + // called first, which initialised the memory. + let device_box = unsafe { DeviceBox::uninitialized() }?.into(); + let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?; + let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; - let device_box = DeviceBox::new(&cuda_repr)?.into(); + let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); - Ok(Self { value, device_box }) + Ok(Self { + value, + device_box, + locked_cuda_repr, + move_event, + }) } + /// Moves the data synchronously to the CUDA device, where it can then be + /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably + /// via [`ExchangeWrapperOnDevice::as_mut`]. + /// + /// To avoid aliasing, each CUDA thread will get access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?; + *self.locked_cuda_repr = cuda_repr; - self.device_box.copy_from(&cuda_repr)?; + self.device_box.copy_from(&self.locked_cuda_repr)?; Ok(ExchangeWrapperOnDevice { value: self.value, device_box: self.device_box, - cuda_repr, + locked_cuda_repr: self.locked_cuda_repr, null_alloc, + move_event: self.move_event, + }) + } +} + +impl> ExchangeWrapperOnHost { + /// Moves the data asynchronously to the CUDA device. + /// + /// To avoid aliasing, each CUDA thread will get access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_device_async<'stream>( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; + *self.locked_cuda_repr = cuda_repr; + + // Safety: The device value is not safely exposed until either + // - the passed-in [`Stream`] is synchronised + // - the kernel is launched on the passed-in [`Stream`] + unsafe { + self.device_box + .async_copy_from(&self.locked_cuda_repr, stream) + }?; + self.move_event.record(stream)?; + + Ok(ExchangeWrapperOnDeviceAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + }) + } 
+} + +impl<'stream, T: RustToCuda> + ExchangeWrapperOnHostAsync<'stream, T> +{ + /// Synchronises the host CPU thread until the data has moved to the CPU. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn sync_to_host(self) -> CudaResult> { + self.move_event.synchronize()?; + + Ok(ExchangeWrapperOnHost { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, }) } } @@ -65,29 +177,96 @@ impl> DerefMut for ExchangeWrapper } } +impl<'stream, T: RustToCuda> + ExchangeWrapperOnDeviceAsync<'stream, T> +{ + /// Synchronises the host CPU thread until the data has moved to the GPU. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn sync_to_device(self) -> CudaResult> { + self.move_event.synchronize()?; + + Ok(ExchangeWrapperOnDevice { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc: self.null_alloc, + move_event: self.move_event, + }) + } +} + impl> ExchangeWrapperOnDevice { + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// /// # Errors - /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + // Reflect deep changes back to the CPU let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + // Note: Shallow changes are not reflected back to the CPU + Ok(ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, }) } pub fn as_ref( &self, ) -> HostAndDeviceConstRef::CudaRepresentation>> { - // Safety: `device_box` contains exactly the device copy of `cuda_repr` - unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.cuda_repr) } + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.locked_cuda_repr) } } pub fn as_mut( &mut self, ) -> HostAndDeviceMutRef::CudaRepresentation>> { - // Safety: `device_box` contains exactly the device copy of `cuda_repr` - unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.cuda_repr) } + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.locked_cuda_repr) } + } +} + +impl> ExchangeWrapperOnDevice { + /// Moves the data asynchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host_async<'stream>( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = + unsafe { self.value.restore_async(self.null_alloc, stream) }?; + + // Note: Shallow changes are not reflected back to the CPU + + self.move_event.record(stream)?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + }) } } diff --git a/src/utils/option.rs b/src/utils/option.rs index 7ef601137..18b86527b 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -3,7 +3,10 @@ use core::mem::MaybeUninit; use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaProxy}, + common::{ + CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, + RustToCudaProxy, + }, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -83,6 +86,62 @@ unsafe impl RustToCuda for Option { } } +unsafe impl RustToCudaAsync for Option { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = match self { + None => ( + OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }, + CombinedCudaAlloc::new(None, alloc), + ), + Some(value) => { + let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; + + let (alloc_front, alloc_tail) = alloc.split(); + + ( + 
OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }, + CombinedCudaAlloc::new(Some(alloc_front), alloc_tail), + ) + }, + }; + + Ok((DeviceAccessible::from(cuda_repr), alloc)) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + + match (self, alloc_front) { + (Some(value), Some(alloc_front)) => { + value.restore_async(CombinedCudaAlloc::new(alloc_front, alloc_tail), stream) + }, + _ => Ok(alloc_tail), + } + } +} + unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = Option<::RustRepresentation>; @@ -101,12 +160,30 @@ impl RustToCudaProxy> for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype + unsafe { &*(val as *const Option).cast() } + } + + fn from_mut(val: &mut Option) -> &mut Self { + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype + unsafe { &mut *(val as *mut Option).cast() } + } + + fn into(self) -> Option { + self.map(SafeDeviceCopyWrapper::into_inner) + } +} + +impl RustToCudaAsyncProxy> + for Option> +{ + fn from_ref(val: &Option) -> &Self { + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype unsafe { &*(val as *const Option).cast() } } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: `SafeDeviceCopyWrapper` is a transparent newtype + // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype unsafe { &mut *(val as *mut Option).cast() } } From 4edc14b82c2357a1cebee097ae6f1f87eb3d972e Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 14:36:34 +0000 Subject: [PATCH 002/120] Experiments with Rust Futures --- src/host.rs | 26 ++++-- src/lib.rs | 1 + src/utils/exchange/wrapper.rs | 153 ++++++++++++++++++++++++++++++++-- 3 files 
changed, 163 insertions(+), 17 deletions(-) diff --git a/src/host.rs b/src/host.rs index 3c19ac6fd..d434e8d67 100644 --- a/src/host.rs +++ b/src/host.rs @@ -297,19 +297,29 @@ impl CombinedCudaAlloc { } } -pub struct CudaDropWrapper(Option); +#[repr(transparent)] +pub struct CudaDropWrapper(ManuallyDrop); impl private::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { - Self(Some(val)) + Self(ManuallyDrop::new(val)) + } +} +impl CudaDropWrapper { + pub fn into_inner(self) -> C { + let this = ManuallyDrop::new(self); + + // Safety: move out of drop, caller now has to deal with CUDA drop again + unsafe { core::ptr::read(&*this.0) } } } impl Drop for CudaDropWrapper { fn drop(&mut self) { - if let Some(val) = self.0.take() { - if let Err((_err, val)) = C::drop(val) { - core::mem::forget(val); - } + // Safety: drop is only ever called once + let val = unsafe { ManuallyDrop::take(&mut self.0) }; + + if let Err((_err, val)) = C::drop(val) { + core::mem::forget(val); } } } @@ -317,12 +327,12 @@ impl Deref for CudaDropWrapper { type Target = C; fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() + &self.0 } } impl DerefMut for CudaDropWrapper { fn deref_mut(&mut self) -> &mut Self::Target { - self.0.as_mut().unwrap() + &mut self.0 } } diff --git a/src/lib.rs b/src/lib.rs index 3c176e4a2..2c202ffee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ #![feature(const_type_name)] #![feature(offset_of)] #![feature(adt_const_params)] +#![feature(impl_trait_in_assoc_type)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index f22a6defe..61ab9899f 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,20 +1,25 @@ use core::{ + future::{Future, IntoFuture}, marker::PhantomData, ops::{Deref, DerefMut}, + task::{Poll, Waker}, }; +use 
std::sync::Mutex; +use alloc::sync::Arc; use rustacuda::{ - error::CudaResult, - event::{Event, EventFlags}, + error::{CudaError, CudaResult}, + event::{Event, EventFlags, EventStatus}, memory::DeviceBox, - stream::Stream, + stream::{Stream, StreamWaitEventFlags}, }; use crate::{ common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, host::{ CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, - HostAndDeviceMutRef, HostDeviceBox, HostLockedBox, NullCudaAlloc, + HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, + HostLockedBox, NullCudaAlloc, }, }; @@ -51,7 +56,8 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda::CudaRepresentation>>, null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, move_event: CudaDropWrapper, - stream: PhantomData<&'stream Stream>, + stream: &'stream Stream, + waker: Arc>>, } impl> ExchangeWrapperOnHost { @@ -116,10 +122,10 @@ impl> ExchangeWrapperOnHost( + pub fn move_to_device_async( mut self, - stream: &'stream Stream, - ) -> CudaResult> { + stream: &Stream, + ) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; *self.locked_cuda_repr = cuda_repr; @@ -132,13 +138,25 @@ impl> ExchangeWrapperOnHost>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnDeviceAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, null_alloc, move_event: self.move_event, - stream: PhantomData::<&'stream Stream>, + stream, + waker, }) } } @@ -161,6 +179,30 @@ impl<'stream, T: RustToCuda> move_event: self.move_event, }) } + + /// Moves the asynchronous data move to a different [`Stream`]. 
+ /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_stream<'stream2>( + self, + stream: &'stream2 Stream, + ) -> CudaResult> { + let old_event = self.move_event.into_inner(); + let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); + + stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; + new_event.record(stream)?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: new_event, + stream: PhantomData::<&'stream2 Stream>, + }) + } } impl> Deref for ExchangeWrapperOnHost { @@ -196,6 +238,99 @@ impl<'stream, T: RustToCuda> move_event: self.move_event, }) } + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_stream( + self, + stream: &Stream, + ) -> CudaResult> { + let old_event = self.move_event.into_inner(); + let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); + + stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; + new_event.record(stream)?; + + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + + Ok(ExchangeWrapperOnDeviceAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + null_alloc: self.null_alloc, + move_event: new_event, + stream, + waker: self.waker, + }) + } + + pub fn as_ref_async( + &self, + ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { + HostAndDeviceConstRefAsync::new(&self.device_box, &self.locked_cuda_repr, self.stream) + } + } + + pub fn as_mut_async( + &mut self, + ) -> 
HostAndDeviceMutRefAsync::CudaRepresentation>> { + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` + unsafe { + HostAndDeviceMutRefAsync::new( + &mut self.device_box, + &mut self.locked_cuda_repr, + self.stream, + ) + } + } +} + +impl<'stream, T: RustToCuda> IntoFuture + for ExchangeWrapperOnDeviceAsync<'stream, T> +{ + type Output = CudaResult>; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + core::future::poll_fn(move |cx| match &wrapper { + Some(inner) => match inner.move_event.query() { + Ok(EventStatus::NotReady) => match inner.waker.lock() { + Ok(mut w) => { + *w = Some(cx.waker().clone()); + Poll::Pending + }, + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + Ok(EventStatus::Ready) => match wrapper.take() { + Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { + value: inner.value, + device_box: inner.device_box, + locked_cuda_repr: inner.locked_cuda_repr, + null_alloc: inner.null_alloc, + move_event: inner.move_event, + })), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }) + } } impl> ExchangeWrapperOnDevice { From 8aa63164eb25d5aec06121fc46941b5e2893ea0c Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 16:53:29 +0000 Subject: [PATCH 003/120] Implemented derive for RustToCudaAsync --- rust-cuda-derive/src/rust_to_cuda/generics.rs | 37 ++++- rust-cuda-derive/src/rust_to_cuda/impl.rs | 73 ++++++++- rust-cuda-derive/src/rust_to_cuda/mod.rs | 14 +- src/utils/exchange/wrapper.rs | 144 ++++++++++++++++-- 4 files changed, 250 insertions(+), 18 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 8b21246d2..d08b1e7c3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -4,7 
+4,12 @@ use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] pub fn expand_cuda_struct_generics_where_requested_in_attrs( ast: &syn::DeriveInput, -) -> (Vec, syn::Generics, Vec) { +) -> ( + Vec, + syn::Generics, + syn::Generics, + Vec, +) { let mut type_params = ast .generics .type_params() @@ -13,6 +18,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut struct_attrs_cuda = ast.attrs.clone(); let mut struct_generics_cuda = ast.generics.clone(); + let mut struct_generics_cuda_async = ast.generics.clone(); let mut struct_layout_attrs = Vec::new(); for ty in &type_params { @@ -36,11 +42,17 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( path, lit: syn::Lit::Str(s), .. - })) if path.is_ident("bound") => match syn::parse_str(&s.value()) { - Ok(bound) => struct_generics_cuda - .make_where_clause() - .predicates - .push(bound), + })) if path.is_ident("bound") => match syn::parse_str::(&s.value()) { + Ok(bound) => { + struct_generics_cuda + .make_where_clause() + .predicates + .push(bound.clone()); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(bound); + }, Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(bound = \"\")] \ @@ -136,7 +148,18 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( .push(syn::parse_quote! { #ty: ::rust_cuda::common::RustToCuda }); + struct_generics_cuda_async + .make_where_clause() + .predicates + .push(syn::parse_quote! 
{ + #ty: ::rust_cuda::common::RustToCudaAsync + }); } - (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) + ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 2c6593068..1028f0ed6 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -82,7 +82,8 @@ pub fn rust_to_cuda_trait( #[cfg(not(target_os = "cuda"))] unsafe fn borrow( - &self, alloc: CudaAllocType + &self, + alloc: CudaAllocType, ) -> rust_cuda::rustacuda::error::CudaResult<( rust_cuda::common::DeviceAccessible, rust_cuda::host::CombinedCudaAlloc @@ -117,6 +118,76 @@ pub fn rust_to_cuda_trait( } } +#[allow(clippy::too_many_arguments)] +pub fn rust_to_cuda_async_trait( + struct_name: &syn::Ident, + struct_name_cuda: &syn::Ident, + struct_generics_cuda_async: &syn::Generics, + struct_fields_cuda: &syn::Fields, + r2c_field_async_declarations: &[TokenStream], + r2c_field_initialisations: &[TokenStream], + r2c_field_async_destructors: &[TokenStream], +) -> TokenStream { + let rust_to_cuda_struct_construction = match struct_fields_cuda { + syn::Fields::Named(_) => quote! { + #struct_name_cuda { + #(#r2c_field_initialisations)* + } + }, + syn::Fields::Unnamed(_) => quote! { + #struct_name_cuda ( + #(#r2c_field_initialisations)* + ) + }, + syn::Fields::Unit => quote! { #struct_name_cuda }, + }; + + let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); + + quote! 
{ + unsafe impl #impl_generics rust_cuda::common::RustToCudaAsync for #struct_name #ty_generics + #where_clause + { + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rust_cuda::rustacuda::stream::Stream, + ) -> rust_cuda::rustacuda::error::CudaResult<( + rust_cuda::common::DeviceAccessible, + rust_cuda::host::CombinedCudaAlloc + )> { + let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_tail = alloc; + + #(#r2c_field_async_declarations)* + + let borrow = #rust_to_cuda_struct_construction; + + Ok(( + rust_cuda::common::DeviceAccessible::from(borrow), + rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + )) + } + + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: rust_cuda::host::CombinedCudaAlloc< + Self::CudaAllocation, CudaAllocType + >, + stream: &rust_cuda::rustacuda::stream::Stream, + ) -> rust_cuda::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + + #(#r2c_field_async_destructors)* + + Ok(alloc_tail) + } + } + } +} + pub fn cuda_as_rust_trait( struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 00e756c00..5cfa6fb18 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -69,7 +69,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let (struct_attrs_cuda, struct_generics_cuda, struct_layout_attrs) = + let (struct_attrs_cuda, struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let cuda_struct_declaration = r#impl::cuda_struct_declaration( @@ -93,6 +93,16 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &r2c_field_destructors, ); + let rust_to_cuda_async_trait_impl = 
r#impl::rust_to_cuda_async_trait( + struct_name, + &struct_name_cuda, + &struct_generics_cuda_async, + &struct_fields_cuda, + &r2c_field_async_declarations, + &r2c_field_initialisations, + &r2c_field_async_destructors, + ); + let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( struct_name, &struct_name_cuda, @@ -106,6 +116,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { #rust_to_cuda_trait_impl + #rust_to_cuda_async_trait_impl + #cuda_as_rust_trait_impl }) .into() diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 61ab9899f..e9d5a0329 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -38,6 +38,7 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda::CudaRepresentation>>, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, + waker: Arc>>, } #[allow(clippy::module_name_repetitions)] @@ -185,22 +186,64 @@ impl<'stream, T: RustToCuda> /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_stream<'stream2>( - self, - stream: &'stream2 Stream, - ) -> CudaResult> { + pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { let old_event = self.move_event.into_inner(); let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; new_event.record(stream)?; + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnHostAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, move_event: new_event, - stream: PhantomData::<&'stream2 Stream>, + stream: PhantomData::<&Stream>, + waker: self.waker, + }) + } +} + +impl<'stream, T: RustToCuda> IntoFuture + for ExchangeWrapperOnHostAsync<'stream, T> +{ + type Output = 
CudaResult>; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + core::future::poll_fn(move |cx| match &wrapper { + Some(inner) => match inner.move_event.query() { + Ok(EventStatus::NotReady) => match inner.waker.lock() { + Ok(mut w) => { + *w = Some(cx.waker().clone()); + Poll::Pending + }, + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + Ok(EventStatus::Ready) => match wrapper.take() { + Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { + value: inner.value, + device_box: inner.device_box, + locked_cuda_repr: inner.locked_cuda_repr, + move_event: inner.move_event, + })), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), }) } } @@ -295,6 +338,77 @@ impl<'stream, T: RustToCuda> ) } } + + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host(mut self) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + + // Note: Shallow changes are not reflected back to the CPU + + Ok(ExchangeWrapperOnHost { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + }) + } +} + +impl<'stream, T: RustToCudaAsync> + ExchangeWrapperOnDeviceAsync<'stream, T> +{ + /// Moves the data asynchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. 
Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn move_to_host_async( + mut self, + stream: &'stream Stream, + ) -> CudaResult> { + // Reflect deep changes back to the CPU + let _null_alloc: NullCudaAlloc = + unsafe { self.value.restore_async(self.null_alloc, stream) }?; + + // Note: Shallow changes are not reflected back to the CPU + + self.move_event.record(stream)?; + + let waker: Arc>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + + Ok(ExchangeWrapperOnHostAsync { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + move_event: self.move_event, + stream: PhantomData::<&'stream Stream>, + waker, + }) + } } impl<'stream, T: RustToCuda> IntoFuture @@ -384,10 +498,10 @@ impl> ExchangeWrapperOnDevice /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_host_async<'stream>( + pub fn move_to_host_async( mut self, - stream: &'stream Stream, - ) -> CudaResult> { + stream: &Stream, + ) -> CudaResult> { // Reflect deep changes back to the CPU let _null_alloc: NullCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; @@ -396,12 +510,24 @@ impl> ExchangeWrapperOnDevice self.move_event.record(stream)?; + let waker: Arc>> = Arc::new(Mutex::new(None)); + + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut w) = waker_callback.lock() { + if let Some(w) = w.take() { + w.wake(); + } + } + }))?; + Ok(ExchangeWrapperOnHostAsync { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, move_event: 
self.move_event, - stream: PhantomData::<&'stream Stream>, + stream: PhantomData::<&Stream>, + waker, }) } } From e7b6174ff1112bf3765db8411f6c873be6653a7e Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 18:47:36 +0000 Subject: [PATCH 004/120] Implemented async kernel launch --- .../generate/cpu_linker_macro/get_ptx_str.rs | 2 +- .../generate/cpu_linker_macro/kernel_func.rs | 18 +++-- .../async_func_types.rs} | 8 +-- .../launch_types.rs | 0 .../mod.rs | 24 +++---- .../type_wrap.rs | 4 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 8 +-- .../kernel/wrapper/generate/cpu_wrapper.rs | 19 +++--- rust-cuda-derive/src/kernel/wrapper/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/parse.rs | 2 +- src/host.rs | 68 +++++++++++-------- 11 files changed, 87 insertions(+), 70 deletions(-) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw/raw_func_types.rs => kernel_func_async/async_func_types.rs} (93%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/launch_types.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/mod.rs (87%) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{kernel_func_raw => kernel_func_async}/type_wrap.rs (89%) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index dadda41ec..d39246484 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -30,7 +30,7 @@ pub(super) fn quote_get_ptx_str( .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); let cpu_func_lifetime_erased_types = - super::kernel_func_raw::generate_launch_types(config, generics, inputs, macro_type_ids).1; + 
super::kernel_func_async::generate_launch_types(config, generics, inputs, macro_type_ids).1; let matching_kernel_assert = if skip_kernel_compilation() { quote!() diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 7cad78e05..fda5b96e4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -89,7 +89,9 @@ fn generate_raw_func_input_wrap( func_inputs, func_input_cuda_types, }: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, + FuncIdent { + func_ident_async, .. + }: &FuncIdent, func_params: &[syn::Ident], ) -> TokenStream { func_inputs @@ -99,7 +101,11 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_raw(#(#func_params),*) + self.#func_ident_async(#(#func_params),*)?; + let rust_cuda::host::LaunchPackage { + stream, .. + } = rust_cuda::host::Launcher::get_launch_package(self); + stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { @@ -119,7 +125,7 @@ fn generate_raw_func_input_wrap( let __result = (|#pat| { #inner })(unsafe { rust_cuda::host::HostAndDeviceConstRef::new( &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ) + ).as_async() }); #[allow(invalid_reference_casting)] @@ -149,16 +155,16 @@ fn generate_raw_func_input_wrap( if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { if mutability.is_some() { quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( - #pat, |#pat| { #inner } + #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } else { quote! { rust_cuda::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { #inner } + #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } } else { quote! 
{ rust_cuda::host::LendToCuda::move_to_cuda( - #pat, |#pat| { #inner } + #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs similarity index 93% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 380048ec5..50e74b02e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/raw_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -5,7 +5,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; -pub(super) fn generate_raw_func_types( +pub(super) fn generate_async_func_types( KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -62,11 +62,11 @@ pub(super) fn generate_raw_func_types( } quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -77,7 +77,7 @@ pub(super) fn generate_raw_func_types( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> }; quote! 
{ diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/launch_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs similarity index 87% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index ab352b4c8..112e760c9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -2,32 +2,32 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +mod async_func_types; mod launch_types; -mod raw_func_types; mod type_wrap; +use async_func_types::generate_async_func_types; pub(super) use launch_types::generate_launch_types; -use raw_func_types::generate_raw_func_types; use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_raw( +pub(super) fn quote_kernel_func_async( config @ KernelConfig { args, .. }: &KernelConfig, decl_generics @ DeclGenerics { - generic_start_token, generic_wrapper_params, - generic_close_token, generic_wrapper_where_clause, .. }: &DeclGenerics, func_inputs: &FunctionInputs, - FuncIdent { func_ident_raw, .. }: &FuncIdent, + FuncIdent { + func_ident_async, .. 
+ }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_raw = - generate_raw_func_types(config, decl_generics, func_inputs, macro_type_ids); + let new_func_inputs_async = + generate_async_func_types(config, decl_generics, func_inputs, macro_type_ids); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = @@ -36,8 +36,8 @@ pub(super) fn quote_kernel_func_raw( quote! { #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw),* + fn #func_ident_async <'stream, #generic_wrapper_params> ( + &'stream mut self, #(#new_func_inputs_async),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -102,9 +102,7 @@ pub(super) fn quote_kernel_func_raw( &#func_params as *const _ as *mut ::std::ffi::c_void ),* ] - ) }?; - - stream.synchronize() + ) } })(#(#func_input_wrap),*) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs similarity index 89% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs index 432930731..50ea505f1 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_raw/type_wrap.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs @@ -17,9 +17,9 @@ pub(super) fn generate_func_input_and_ptx_jit_wraps( syn::FnArg::Typed(syn::PatType { pat, ty, .. 
}) => { #[allow(clippy::if_same_then_else)] let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { #pat.for_device() } + quote! { unsafe { #pat.for_device_async() } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { #pat.for_device() } + quote! { unsafe { #pat.for_device_async() } } } else { quote! { #pat } }; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 7ab891e7e..52fd5c506 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -4,12 +4,12 @@ use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; mod get_ptx_str; mod kernel_func; -mod kernel_func_raw; +mod kernel_func_async; mod new_kernel; use get_ptx_str::quote_get_ptx_str; use kernel_func::quote_kernel_func; -use kernel_func_raw::quote_kernel_func_raw; +use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( @@ -73,7 +73,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_attrs, ¯o_type_ids, ); - let kernel_func_raw = quote_kernel_func_raw( + let kernel_func_async = quote_kernel_func_async( config, decl_generics, func_inputs, @@ -97,7 +97,7 @@ pub(in super::super) fn quote_cpu_linker_macro( #kernel_func - #kernel_func_raw + #kernel_func_async } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index cad3cdc6a..ed93c61dc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -23,7 +23,7 @@ pub(in super::super) fn quote_cpu_wrapper( func_inputs: &FunctionInputs, FuncIdent { func_ident, - func_ident_raw, + func_ident_async, .. 
}: &FuncIdent, func_attrs: &[syn::Attribute], @@ -54,7 +54,7 @@ pub(in super::super) fn quote_cpu_wrapper( }, }; - let (new_func_inputs_decl, new_func_inputs_raw_decl) = + let (new_func_inputs_decl, new_func_inputs_async_decl) = generate_new_func_inputs_decl(config, impl_generics, func_inputs); quote! { @@ -76,8 +76,8 @@ pub(in super::super) fn quote_cpu_wrapper( #generic_wrapper_where_clause; #(#func_attrs)* - fn #func_ident_raw #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_raw_decl),* + fn #func_ident_async <'stream, #generic_wrapper_params> ( + &'stream mut self, #(#new_func_inputs_async_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } @@ -157,11 +157,11 @@ fn generate_new_func_inputs_decl( { let wrapped_type = if mutability.is_some() { syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRef<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -170,7 +170,7 @@ fn generate_new_func_inputs_decl( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwned<#lifetime, #cuda_type> + rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> ); Box::new(wrapped_type) @@ -178,9 +178,8 @@ fn generate_new_func_inputs_decl( cuda_type } }, - }), + }) ), syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() + }).unzip() } diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 6f63af892..c057fe7f1 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -128,7 +128,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let func_ident = FuncIdent { func_ident: 
&func.sig.ident, - func_ident_raw: quote::format_ident!("{}_raw", &func.sig.ident), + func_ident_async: quote::format_ident!("{}_async", &func.sig.ident), func_ident_hash: quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), }; @@ -251,7 +251,7 @@ struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] struct FuncIdent<'f> { func_ident: &'f syn::Ident, - func_ident_raw: syn::Ident, + func_ident_async: syn::Ident, func_ident_hash: syn::Ident, } diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 936143cf2..7d523adb0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -23,7 +23,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { if func.sig.asyncness.is_some() { abort!( func.sig.asyncness.span(), - "Kernel function must not (yet) be async." + "Kernel function must not be async." ); } diff --git a/src/host.rs b/src/host.rs index d434e8d67..f600d9b6e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -618,6 +618,18 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { host_ref: self.host_ref, } } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -693,6 +705,18 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { { *self } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceConstRefAsync { + device_box: self.device_box, + host_ref: self.host_ref, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -740,6 +764,18 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { pub fn for_host(&'a mut self) -> &'a T { 
self.host_val } + + #[must_use] + pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceOwnedAsync<'stream, 'b, T> + where + 'a: 'b, + { + HostAndDeviceOwnedAsync { + device_box: self.device_box, + host_val: self.host_val, + stream: PhantomData::<&'stream Stream>, + } + } } #[allow(clippy::module_name_repetitions)] @@ -882,34 +918,12 @@ pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> } impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { - /// # Errors - /// - /// Returns a [`CudaError`] iff `value` cannot be moved - /// to CUDA or an error occurs inside `inner`. - pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, - inner: F, - ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); - - // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result - } - #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { + /// # Safety + /// + /// The returned [`DeviceConstRef`] must only be used on the + /// constructed-with [`Stream`] + pub unsafe fn for_device_async(self) -> DeviceMutRef<'a, T> { DeviceMutRef { pointer: self.device_box.0.as_raw_mut(), reference: PhantomData, From d93fc4ccb8bdbe86dbce07e2c5959e61f2de5f4c Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 19:09:12 +0000 Subject: [PATCH 005/120] Fixed RustToCudaAsync derive --- .../src/rust_to_cuda/field_copy.rs | 1 + rust-cuda-derive/src/rust_to_cuda/generics.rs | 25 +++++++++++--- rust-cuda-derive/src/rust_to_cuda/mod.rs | 33 ++++++++++++------- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 93326aab6..61891aa8c 100644 
--- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -143,6 +143,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, + stream, )?; }); diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index d08b1e7c3..646686534 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -9,6 +9,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( syn::Generics, syn::Generics, Vec, + bool, ) { let mut type_params = ast .generics @@ -30,6 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( let mut r2c_ignore = false; + let mut r2c_async_impl = None; + struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { if let Ok(syn::Meta::List(list)) = attr.parse_meta() { @@ -90,11 +93,22 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { - path: - syn::Path { - leading_colon: None, - segments, - }, + path, + lit: syn::Lit::Bool(b), + .. + })) if path.is_ident("async") => if r2c_async_impl.is_none() { + r2c_async_impl = Some(b.value()); + } else { + emit_error!( + b.span(), + "[rust-cuda]: Duplicate #[cuda(async)] attribute.", + ); + }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path: syn::Path { + leading_colon: None, + segments, + }, lit: syn::Lit::Str(s), .. 
})) if segments.len() == 2 @@ -161,5 +175,6 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs, + r2c_async_impl.unwrap_or(true), ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 5cfa6fb18..dc8eb6491 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -10,7 +10,7 @@ fn get_cuda_repr_ident(rust_repr_ident: &proc_macro2::Ident) -> proc_macro2::Ide format_ident!("{}CudaRepresentation", rust_repr_ident) } -#[allow(clippy::module_name_repetitions)] +#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let (mut struct_fields_cuda, struct_semi_cuda) = if let syn::Data::Struct(s) = &ast.data { (s.fields.clone(), s.semi_token) @@ -69,8 +69,13 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let (struct_attrs_cuda, struct_generics_cuda, struct_generics_cuda_async, struct_layout_attrs) = - generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); + let ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl, + ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let cuda_struct_declaration = r#impl::cuda_struct_declaration( &struct_attrs_cuda, @@ -93,15 +98,19 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &r2c_field_destructors, ); - let rust_to_cuda_async_trait_impl = r#impl::rust_to_cuda_async_trait( - struct_name, - &struct_name_cuda, - &struct_generics_cuda_async, - &struct_fields_cuda, - &r2c_field_async_declarations, - &r2c_field_initialisations, - &r2c_field_async_destructors, - ); + let rust_to_cuda_async_trait_impl = if r2c_async_impl { + r#impl::rust_to_cuda_async_trait( + struct_name, + 
&struct_name_cuda, + &struct_generics_cuda_async, + &struct_fields_cuda, + &r2c_field_async_declarations, + &r2c_field_initialisations, + &r2c_field_async_destructors, + ) + } else { + TokenStream::new() + }; let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( struct_name, From 5481e47fd51f644f4fbc65b488d54a702ca4e722 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 19:32:01 +0000 Subject: [PATCH 006/120] LaunchPackage with non-mut Stream --- src/host.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host.rs b/src/host.rs index f600d9b6e..cc44abe9a 100644 --- a/src/host.rs +++ b/src/host.rs @@ -53,7 +53,7 @@ pub struct LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, pub kernel: &'l mut TypedKernel, - pub stream: &'l mut Stream, + pub stream: &'l Stream, pub watcher: &'l mut L::CompilationWatcher, } From 6a9d4b6c3692a2848b15903c5ff18c5b9090b3e9 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Sat, 19 Nov 2022 20:18:53 +0000 Subject: [PATCH 007/120] Moved stream to be an explicit kernel argument --- .../generate/cpu_linker_macro/kernel_func.rs | 11 +++++------ .../cpu_linker_macro/kernel_func_async/mod.rs | 8 +++++--- .../src/kernel/wrapper/generate/cpu_wrapper.rs | 14 ++++++++++---- src/host.rs | 3 --- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index fda5b96e4..41d4244b0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -57,8 +57,10 @@ pub(super) fn quote_kernel_func( quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs),* + fn #func_ident <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -101,10 +103,7 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_async(#(#func_params),*)?; - let rust_cuda::host::LaunchPackage { - stream, .. - } = rust_cuda::host::Launcher::get_launch_package(self); + self.#func_ident_async(stream, #(#func_params),*)?; stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 112e760c9..6980a5753 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -36,13 +36,15 @@ pub(super) fn quote_kernel_func_async( quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] - fn #func_ident_async <'stream, #generic_wrapper_params> ( - &'stream mut self, #(#new_func_inputs_async),* + fn #func_ident_async <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_async),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { let rust_cuda::host::LaunchPackage { - kernel, watcher, config, stream + kernel, watcher, config } = rust_cuda::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index ed93c61dc..e5c318140 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -70,14 +70,20 @@ pub(in super::super) fn quote_cpu_wrapper( > where #launcher_predicate; #(#func_attrs)* - fn #func_ident #generic_start_token #generic_wrapper_params #generic_close_token ( - &mut self, #(#new_func_inputs_decl),* + #[allow(clippy::too_many_arguments)] + fn #func_ident <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; #(#func_attrs)* - fn #func_ident_async <'stream, #generic_wrapper_params> ( - &'stream mut self, #(#new_func_inputs_async_decl),* + #[allow(clippy::too_many_arguments)] + fn #func_ident_async <'stream, #generic_wrapper_params>( + &mut self, + stream: &'stream rust_cuda::rustacuda::stream::Stream, + #(#new_func_inputs_async_decl),* ) -> rust_cuda::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } diff --git a/src/host.rs b/src/host.rs index cc44abe9a..98ad817ba 100644 --- a/src/host.rs +++ b/src/host.rs @@ -51,10 +51,7 @@ pub struct LaunchConfig { pub struct 
LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, - pub kernel: &'l mut TypedKernel, - pub stream: &'l Stream, - pub watcher: &'l mut L::CompilationWatcher, } From d1ae9aba11bfe69375be41ef1204704406cd15ff Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 20 Nov 2022 04:39:03 -0800 Subject: [PATCH 008/120] Updated ExchangeWrapperOn[Device|Host]Async::move_to_stream --- Cargo.toml | 6 +++--- rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 -- rust-cuda-ptx-jit/Cargo.toml | 2 +- src/utils/exchange/wrapper.rs | 18 ++++++------------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 17a279023..4dc3f5af1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52" } +rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d" } -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } -rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1028f0ed6..ff607af28 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -42,8 +42,6 @@ pub fn cuda_struct_declaration( } } -// TODO: derive async impl as well -> need different trait bounds - #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( 
struct_name: &syn::Ident, diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index d5b832eb8..5afb336e4 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "fa4bf52", optional = true } +rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index e9d5a0329..a4a8e50f7 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -187,11 +187,8 @@ impl<'stream, T: RustToCuda> /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { - let old_event = self.move_event.into_inner(); - let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); - - stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; - new_event.record(stream)?; + stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; + self.move_event.record(stream)?; let waker_callback = self.waker.clone(); stream.add_callback(Box::new(move |_| { @@ -206,7 +203,7 @@ impl<'stream, T: RustToCuda> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: new_event, + move_event: self.move_event, stream: PhantomData::<&Stream>, waker: self.waker, }) @@ -291,11 +288,8 @@ impl<'stream, T: RustToCuda> self, stream: &Stream, ) -> CudaResult> { - let old_event = self.move_event.into_inner(); - let new_event: CudaDropWrapper = Event::new(EventFlags::DISABLE_TIMING)?.into(); - - stream.wait_event(old_event, StreamWaitEventFlags::DEFAULT)?; - new_event.record(stream)?; + 
stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; + self.move_event.record(stream)?; let waker_callback = self.waker.clone(); stream.add_callback(Box::new(move |_| { @@ -311,7 +305,7 @@ impl<'stream, T: RustToCuda> device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, null_alloc: self.null_alloc, - move_event: new_event, + move_event: self.move_event, stream, waker: self.waker, }) From d70ea5c5107c98a9b0d51de4ea4e7dae95a78bda Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 20 Nov 2022 05:35:17 -0800 Subject: [PATCH 009/120] Upgraded to fixed RustaCuda --- Cargo.toml | 6 +++--- rust-cuda-ptx-jit/Cargo.toml | 2 +- src/host.rs | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4dc3f5af1..2ebfbe32e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,10 +24,10 @@ host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] -rustacuda_core = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d" } +rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } -rustacuda = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } -rustacuda_derive = { git = "https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } +rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } const-type-layout = { version = "0.2.0", features = ["derive"] } diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index 5afb336e4..aa7fa32c6 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -12,6 +12,6 @@ default = [] host = ["regex", "rustacuda", "lazy_static"] [dependencies] -rustacuda = { git = 
"https://github.com/MomoLangenstein/RustaCUDA", rev = "bc2c42d", optional = true } +rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } regex = { version = "1.5", optional = true } lazy_static = { version = "1.4", optional = true } diff --git a/src/host.rs b/src/host.rs index 98ad817ba..a104c50a3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -302,14 +302,6 @@ impl From for CudaDropWrapper { Self(ManuallyDrop::new(val)) } } -impl CudaDropWrapper { - pub fn into_inner(self) -> C { - let this = ManuallyDrop::new(self); - - // Safety: move out of drop, caller now has to deal with CUDA drop again - unsafe { core::ptr::read(&*this.0) } - } -} impl Drop for CudaDropWrapper { fn drop(&mut self) { // Safety: drop is only ever called once From 077e965a4d9bb44bfaefb1d1549da682437bf302 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 02:08:34 -0800 Subject: [PATCH 010/120] Added scratch-space methods for uni-directional CudaExchangeItem --- src/utils/exchange/buffer/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 1a940faa0..e7141a43e 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -60,3 +60,19 @@ impl AsMut for CudaExchangeItem { &mut self.0 } } + +impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_scratch_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_scratch_mut(&mut self) -> &mut T { + &mut self.0 + } +} From ea6e45997348fbd9d6bb20cfe18dbce0026bbb3a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 02:32:38 -0800 Subject: [PATCH 011/120] Added unsafe-aliasing API to SplitSlideOverCudaThreads[Const|Dynamic]Stride --- src/utils/aliasing/const.rs | 21 +++++++++++++++++++++ 
src/utils/aliasing/dynamic.rs | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 8f7f1ab98..e1d069710 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -42,6 +42,27 @@ fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +impl SplitSliceOverCudaThreadsConstStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_unchecked(&self) -> &T { + &self.0 + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + &mut self.0 + } +} + #[cfg(all(not(feature = "host"), target_os = "cuda"))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 6cba2ff9c..c07bd60a4 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -42,6 +42,27 @@ fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +impl SplitSliceOverCudaThreadsDynamicStride { + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. 
+ pub unsafe fn get_unchecked(&self) -> &T { + &self.inner + } + + /// # Safety + /// + /// All cross-CUDA-thread aliasing guarantees are lost with this method. + /// Instead, the caller must ensure that no two threads in a kernel launch + /// access the same underlying elements. + pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + &mut self.inner + } +} + #[cfg(all(not(feature = "host"), target_os = "cuda"))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; From 578453f2e48466f50e9559d4506fd85ebe4baf14 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 03:11:01 -0800 Subject: [PATCH 012/120] Extended the CudaExchangeItem API with scratch and uMaybeUninit --- src/utils/exchange/buffer/mod.rs | 54 ++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index e7141a43e..c4e4b24bd 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -1,3 +1,5 @@ +use core::mem::MaybeUninit; + mod common; #[cfg(any(not(feature = "host"), doc))] mod device; @@ -62,6 +64,12 @@ impl AsMut for CudaExchangeItem { } impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_scratch(&self) -> &T { + &self.0 + } + #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] pub fn as_scratch_mut(&mut self) -> &mut T { @@ -70,9 +78,55 @@ impl CudaExchangeItem { } impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_scratch(&self) -> &T { + &self.0 + } + #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } + +impl CudaExchangeItem { + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_uninit(&self) -> &MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is 
a transparent newtype + unsafe { &*(self as *const Self).cast() } + } + + #[cfg(any(feature = "host", doc))] + #[doc(cfg(feature = "host"))] + pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &mut *(self as *mut Self).cast() } + } +} + +impl CudaExchangeItem { + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_uninit(&self) -> &MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &*(self as *const Self).cast() } + } + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { + // Safety: + // - MaybeUninit is a transparent newtype union + // - CudaExchangeItem is a transparent newtype + unsafe { &mut *(self as *mut Self).cast() } + } +} From c55d26979094e859557babafc449fb7cdbc40c16 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 27 Nov 2022 03:43:06 -0800 Subject: [PATCH 013/120] Rename SplitSliceOverCudaThreads[Const|Dynamic]Strude::alias_[mut_]unchecked --- src/utils/aliasing/const.rs | 4 ++-- src/utils/aliasing/dynamic.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index e1d069710..a60a94eb9 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_unchecked(&self) -> &T { + pub unsafe fn alias_unchecked(&self) -> &T { &self.0 } @@ -58,7 +58,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
/// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { &mut self.0 } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c07bd60a4..668112f88 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn get_unchecked(&self) -> &T { + pub unsafe fn alias_unchecked(&self) -> &T { &self.inner } @@ -58,7 +58,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. 
- pub unsafe fn get_mut_unchecked(&mut self) -> &mut T { + pub unsafe fn alias_mut_unchecked(&mut self) -> &mut T { &mut self.inner } } From 96036b08253aafd0c68a91a814b03c1768656f34 Mon Sep 17 00:00:00 2001 From: Juniper Langenstein Date: Tue, 29 Nov 2022 09:12:14 +0000 Subject: [PATCH 014/120] Implemented #[cuda(crate)] and #[kernel(crate)] attributes --- examples/derive/Cargo.toml | 2 +- examples/derive/src/lib.rs | 6 +- examples/single-source/Cargo.toml | 4 +- examples/single-source/src/main.rs | 32 ++++---- .../generate/cpu_linker_macro/get_ptx_str.rs | 31 +++++--- .../generate/cpu_linker_macro/kernel_func.rs | 28 ++++--- .../kernel_func_async/async_func_types.rs | 13 ++-- .../kernel_func_async/launch_types.rs | 19 ++--- .../cpu_linker_macro/kernel_func_async/mod.rs | 42 +++++++---- .../wrapper/generate/cpu_linker_macro/mod.rs | 12 ++- .../generate/cpu_linker_macro/new_kernel.rs | 9 ++- .../kernel/wrapper/generate/cpu_wrapper.rs | 30 ++++---- .../kernel/wrapper/generate/cuda_wrapper.rs | 51 +++++++------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 74 +++++++++++++++++-- .../src/rust_to_cuda/field_copy.rs | 45 +++++------ rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 17 +++-- rust-cuda-derive/src/rust_to_cuda/generics.rs | 47 ++++++++++-- rust-cuda-derive/src/rust_to_cuda/impl.rs | 64 +++++++++------- rust-cuda-derive/src/rust_to_cuda/mod.rs | 27 ++++--- 19 files changed, 354 insertions(+), 199 deletions(-) diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml index e59a344af..f4ea53d90 100644 --- a/examples/derive/Cargo.toml +++ b/examples/derive/Cargo.toml @@ -9,4 +9,4 @@ edition = "2021" [dependencies] const-type-layout = { version = "0.2.0" } -rust-cuda = { path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 814e30f61..76a7d3cb1 100644 --- a/examples/derive/src/lib.rs +++ 
b/examples/derive/src/lib.rs @@ -2,13 +2,15 @@ #![feature(const_type_name)] #![feature(offset_of)] -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 128da7cef..351d694a0 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -11,7 +11,7 @@ edition = "2021" const-type-layout = { version = "0.2.0" } [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 36c0736c6..79f6e3ec1 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,32 +10,34 @@ extern crate alloc; -#[macro_use] -extern crate const_type_layout; - #[cfg(not(target_os = "cuda"))] fn main() {} #[repr(C)] -#[derive(TypeLayout)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = "rc::const_type_layout")] pub struct Dummy(i32); -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] inner: T, } -#[derive(rust_cuda::common::LendRustToCuda)] +#[derive(rc::common::LendRustToCuda)] +#[cuda(crate = "rc")] pub struct Empty([u8; 0]); #[repr(C)] -#[derive(TypeLayout)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); 
-#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] -pub fn kernel<'a, T: rust_cuda::common::RustToCuda>( +#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[kernel(crate = "rc")] +pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -43,7 +45,7 @@ pub fn kernel<'a, T: rust_cuda::common::RustToCuda>( #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, ) where - ::CudaRepresentation: rust_cuda::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, { } @@ -52,16 +54,16 @@ mod host { use super::{Kernel, KernelArgs}; #[allow(dead_code)] - struct Launcher(core::marker::PhantomData); + struct Launcher(core::marker::PhantomData); link_kernel!(crate::Empty); - link_kernel!(rust_cuda::utils::device_copy::SafeDeviceCopyWrapper); + link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); - impl rust_cuda::host::Launcher for Launcher { + impl rc::host::Launcher for Launcher { type CompilationWatcher = (); type KernelTraitObject = dyn Kernel; - fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { + fn get_launch_package(&mut self) -> rc::host::LaunchPackage { unimplemented!() } } @@ -71,7 +73,7 @@ mod host { mod cuda_prelude { use core::arch::nvptx; - use rust_cuda::device::utils; + use rc::device::utils; #[global_allocator] static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d39246484..179ba7eed 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -6,6 +6,7 @@ use 
crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; pub(super) fn quote_get_ptx_str( + crate_path: &syn::Path, FuncIdent { func_ident, func_ident_hash, @@ -29,19 +30,25 @@ pub(super) fn quote_get_ptx_str( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let cpu_func_lifetime_erased_types = - super::kernel_func_async::generate_launch_types(config, generics, inputs, macro_type_ids).1; + let cpu_func_lifetime_erased_types = super::kernel_func_async::generate_launch_types( + crate_path, + config, + generics, + inputs, + macro_type_ids, + ) + .1; let matching_kernel_assert = if skip_kernel_compilation() { quote!() } else { quote::quote_spanned! { func_ident.span()=> - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( + const _: #crate_path::safety::kernel_signature::Assert<{ + #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match + }> = #crate_path::safety::kernel_signature::Assert::<{ + #crate_path::safety::kernel_signature::check( PTX_STR.as_bytes(), - concat!(".visible .entry ", rust_cuda::host::specialise_kernel_call!( + concat!(".visible .entry ", #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -64,10 +71,10 @@ pub(super) fn quote_get_ptx_str( ); quote::quote_spanned! 
{ ty.span()=> - const _: ::rust_cuda::safety::type_layout::Assert<{ - ::rust_cuda::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = ::rust_cuda::safety::type_layout::Assert::<{ - ::rust_cuda::safety::type_layout::check::<#ty>(#layout_param) + const _: #crate_path::safety::type_layout::Assert<{ + #crate_path::safety::type_layout::CpuAndGpuTypeLayouts::Match + }> = #crate_path::safety::type_layout::Assert::<{ + #crate_path::safety::type_layout::check::<#ty>(#layout_param) }>; } }) @@ -76,7 +83,7 @@ pub(super) fn quote_get_ptx_str( quote! { fn get_ptx_str() -> &'static str { - rust_cuda::host::link_kernel!{ + #crate_path::host::link_kernel!{ #func_ident #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 41d4244b0..d6e70e276 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -2,7 +2,9 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +#[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -52,16 +54,17 @@ pub(super) fn quote_kernel_func( }) .collect::>(); - let raw_func_input_wrap = generate_raw_func_input_wrap(inputs, fn_ident, func_params); + let raw_func_input_wrap = + generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { // impls check adapted from Nikolai Vazquez's `impls` crate: @@ -87,6 +90,7 @@ pub(super) fn quote_kernel_func( #[allow(clippy::too_many_lines)] fn generate_raw_func_input_wrap( + crate_path: &syn::Path, FunctionInputs { func_inputs, func_input_cuda_types, @@ -114,16 +118,16 @@ fn generate_raw_func_input_wrap( // DeviceCopy mode only supports immutable references quote! { - let mut #pat_box = rust_cuda::host::HostDeviceBox::from( - rust_cuda::rustacuda::memory::DeviceBox::new( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) + let mut #pat_box = #crate_path::host::HostDeviceBox::from( + #crate_path::rustacuda::memory::DeviceBox::new( + #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) )? ); #[allow(clippy::redundant_closure_call)] // Safety: `#pat_box` contains exactly the device copy of `#pat` let __result = (|#pat| { #inner })(unsafe { - rust_cuda::host::HostAndDeviceConstRef::new( - &#pat_box, rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) + #crate_path::host::HostAndDeviceConstRef::new( + &#pat_box, #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) ).as_async() }); @@ -145,7 +149,7 @@ fn generate_raw_func_input_wrap( } } else { quote! { { - let #pat = rust_cuda::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); + let #pat = #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); #inner } } } @@ -153,16 +157,16 @@ fn generate_raw_func_input_wrap( InputCudaType::LendRustToCuda => { if let syn::Type::Reference(syn::TypeReference { mutability, .. 
}) = &**ty { if mutability.is_some() { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda_mut( + quote! { #crate_path::host::LendToCuda::lend_to_cuda_mut( #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } else { - quote! { rust_cuda::host::LendToCuda::lend_to_cuda( + quote! { #crate_path::host::LendToCuda::lend_to_cuda( #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } } else { - quote! { rust_cuda::host::LendToCuda::move_to_cuda( + quote! { #crate_path::host::LendToCuda::move_to_cuda( #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } ) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 50e74b02e..c24406c9a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -6,6 +6,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -38,11 +39,11 @@ pub(super) fn generate_async_func_types( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote! { - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote! 
{ - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -62,11 +63,11 @@ pub(super) fn generate_async_func_types( } quote!( - rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { quote!( - rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -77,7 +78,7 @@ pub(super) fn generate_async_func_types( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = quote! { - rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> }; quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index 0fed7282f..16cd0008e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -6,6 +6,7 @@ use crate::kernel::utils::r2c_move_lifetime; use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, DeclGenerics { generic_start_token, @@ -39,11 +40,11 @@ pub(in super::super) fn generate_launch_types( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote::quote_spanned! 
{ ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -57,18 +58,18 @@ pub(in super::super) fn generate_launch_types( { if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { let lifetime = r2c_move_lifetime(i, ty); quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote! { #cuda_type } @@ -79,16 +80,16 @@ pub(in super::super) fn generate_launch_types( if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> + #crate_path::common::DeviceMutRef<'static, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<'static, #cuda_type> + #crate_path::common::DeviceConstRef<'static, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { quote::quote_spanned! 
{ ty.span()=> - rust_cuda::common::DeviceMutRef<'static, #cuda_type> + #crate_path::common::DeviceMutRef<'static, #cuda_type> } } else { cuda_type diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 6980a5753..44cc4d904 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -12,6 +12,7 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( + crate_path: &syn::Path, config @ KernelConfig { args, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_wrapper_params, @@ -26,29 +27,40 @@ pub(super) fn quote_kernel_func_async( func_attrs: &[syn::Attribute], macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_async = - generate_async_func_types(config, decl_generics, func_inputs, macro_type_ids); + let new_func_inputs_async = generate_async_func_types( + crate_path, + config, + decl_generics, + func_inputs, + macro_type_ids, + ); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types(config, decl_generics, func_inputs, macro_type_ids); + generate_launch_types( + crate_path, + config, + decl_generics, + func_inputs, + macro_type_ids, + ); quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_async),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { - let rust_cuda::host::LaunchPackage { + let #crate_path::host::LaunchPackage { kernel, watcher, config - } = rust_cuda::host::Launcher::get_launch_package(self); + } = #crate_path::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { - rust_cuda::ptx_jit::compilePtxJITwithArguments! { + #crate_path::ptx_jit::compilePtxJITwithArguments! { kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) }? } else { @@ -56,13 +68,13 @@ pub(super) fn quote_kernel_func_async( }; let function = match kernel_jit_result { - rust_cuda::host::KernelJITResult::Recompiled(function) => { + #crate_path::host::KernelJITResult::Recompiled(function) => { // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; + ::on_compile(function, watcher)?; function }, - rust_cuda::host::KernelJITResult::Cached(function) => function, + #crate_path::host::KernelJITResult::Cached(function) => function, }; #[allow(clippy::redundant_closure_call)] @@ -79,14 +91,14 @@ pub(super) fn quote_kernel_func_async( if false { #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} + fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_aliasing() {} #[allow(dead_code)] fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, + T: #crate_path::safety::FitsIntoDeviceRegister, >(_val: &T) {} #(assert_impl_devicecopy(&#func_params);)* @@ -94,7 +106,7 @@ pub(super) fn quote_kernel_func_async( #(assert_impl_fits_into_device_register(&#func_params);)* } - let 
rust_cuda::host::LaunchConfig { + let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, } = config; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 52fd5c506..aedf1e12e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -13,6 +13,7 @@ use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( + crate_path: &syn::Path, config @ KernelConfig { visibility, kernel, @@ -56,6 +57,7 @@ pub(in super::super) fn quote_cpu_linker_macro( }; let get_ptx_str = quote_get_ptx_str( + crate_path, func_ident, config, decl_generics, @@ -63,8 +65,15 @@ pub(in super::super) fn quote_cpu_linker_macro( func_params, ¯o_type_ids, ); - let new_kernel = quote_new_kernel(config, decl_generics, func_ident, ¯o_type_ids); + let new_kernel = quote_new_kernel( + crate_path, + config, + decl_generics, + func_ident, + ¯o_type_ids, + ); let kernel_func = quote_kernel_func( + crate_path, config, decl_generics, func_inputs, @@ -74,6 +83,7 @@ pub(in super::super) fn quote_cpu_linker_macro( ¯o_type_ids, ); let kernel_func_async = quote_kernel_func_async( + crate_path, config, decl_generics, func_inputs, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs index fa32591db..6b53954e4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs @@ -3,6 +3,7 @@ use proc_macro2::TokenStream; use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; pub(super) fn quote_new_kernel( + crate_path: &syn::Path, KernelConfig { kernel, .. 
}: &KernelConfig, DeclGenerics { generic_start_token, @@ -15,19 +16,19 @@ pub(super) fn quote_new_kernel( macro_type_ids: &[syn::Ident], ) -> TokenStream { quote! { - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel > { let ptx = Self::get_ptx_str(); - let entry_point = rust_cuda::host::specialise_kernel_call!( + let entry_point = #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token ); - rust_cuda::host::TypedKernel::new(ptx, entry_point) + #crate_path::host::TypedKernel::new(ptx, entry_point) } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index e5c318140..4851af9ce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -7,6 +7,7 @@ use super::super::{ }; pub(in super::super) fn quote_cpu_wrapper( + crate_path: &syn::Path, config @ KernelConfig { visibility, kernel, .. }: &KernelConfig, @@ -29,7 +30,7 @@ pub(in super::super) fn quote_cpu_wrapper( func_attrs: &[syn::Attribute], ) -> TokenStream { let launcher_predicate = quote! { - Self: Sized + rust_cuda::host::Launcher< + Self: Sized + #crate_path::host::Launcher< KernelTraitObject = dyn #kernel #ty_generics > }; @@ -55,7 +56,7 @@ pub(in super::super) fn quote_cpu_wrapper( }; let (new_func_inputs_decl, new_func_inputs_async_decl) = - generate_new_func_inputs_decl(config, impl_generics, func_inputs); + generate_new_func_inputs_decl(crate_path, config, impl_generics, func_inputs); quote! 
{ #[cfg(not(target_os = "cuda"))] @@ -65,32 +66,33 @@ pub(in super::super) fn quote_cpu_wrapper( { fn get_ptx_str() -> &'static str where #launcher_predicate; - fn new_kernel() -> rust_cuda::rustacuda::error::CudaResult< - rust_cuda::host::TypedKernel + fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel > where #launcher_predicate; #(#func_attrs)* #[allow(clippy::too_many_arguments)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; #(#func_attrs)* #[allow(clippy::too_many_arguments)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, - stream: &'stream rust_cuda::rustacuda::stream::Stream, + stream: &'stream #crate_path::rustacuda::stream::Stream, #(#new_func_inputs_async_decl),* - ) -> rust_cuda::rustacuda::error::CudaResult<()> + ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause; } } } fn generate_new_func_inputs_decl( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { @@ -146,11 +148,11 @@ fn generate_new_func_inputs_decl( let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => syn::parse_quote!( - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> ), InputCudaType::LendRustToCuda => syn::parse_quote!( - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), }; @@ -163,11 +165,11 @@ fn generate_new_func_inputs_decl( { let wrapped_type = if mutability.is_some() { syn::parse_quote!( - rust_cuda::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> ) } else { syn::parse_quote!( - rust_cuda::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> ) }; @@ -176,7 +178,7 @@ fn generate_new_func_inputs_decl( let lifetime = r2c_move_lifetime(i, ty); let wrapped_type = syn::parse_quote!( - rust_cuda::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> ); Box::new(wrapped_type) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index d017efae1..36e316708 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -6,6 +6,7 @@ use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( + crate_path: &syn::Path, config @ KernelConfig { args, .. 
}: &KernelConfig, inputs @ FunctionInputs { func_inputs, @@ -19,8 +20,8 @@ pub(in super::super) fn quote_cuda_wrapper( func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(config, inputs); + let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(crate_path, config, inputs); + let ptx_func_unboxed_types = specialise_ptx_unboxed_types(crate_path, config, inputs); let func_layout_params = func_params .iter() @@ -46,13 +47,13 @@ pub(in super::super) fn quote_cuda_wrapper( // Emit PTX JIT load markers let ptx_jit_load = if ptx_jit.0 { quote! { - rust_cuda::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) + #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) } } else { quote! {} }; let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) }; match cuda_mode { @@ -70,22 +71,22 @@ pub(in super::super) fn quote_cuda_wrapper( if mutability.is_some() { quote! { #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, + #crate_path::device::BorrowFromRust::with_borrow_from_rust_mut( + #pat, |#pat: #and_token #mutability #crate_path::device::ShallowCopy<#syn_type>| { #inner }, ) } } else { quote! { #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token rust_cuda::device::ShallowCopy<#syn_type>| { #inner }, + #crate_path::device::BorrowFromRust::with_borrow_from_rust( + #pat, |#pat: #and_token #crate_path::device::ShallowCopy<#syn_type>| { #inner }, ) } } } else { quote! 
{ #ptx_jit_load; - rust_cuda::device::BorrowFromRust::with_moved_from_rust( + #crate_path::device::BorrowFromRust::with_moved_from_rust( #pat, |#pat: #syn_type| { #inner }, ) } @@ -99,22 +100,22 @@ pub(in super::super) fn quote_cuda_wrapper( quote! { #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { #( #[no_mangle] static #func_layout_params: [ - u8; rust_cuda::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = rust_cuda::const_type_layout::serialise_type_graph::<#ptx_func_types>(); + u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() + ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); *#func_params = &#func_layout_params; )* } #[cfg(target_os = "cuda")] - #[rust_cuda::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { @@ -130,14 +131,14 @@ pub(in super::super) fn quote_cuda_wrapper( if false { #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} + fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_aliasing() {} #[allow(dead_code)] fn assert_impl_fits_into_device_register< - T: rust_cuda::safety::FitsIntoDeviceRegister, + T: #crate_path::safety::FitsIntoDeviceRegister, >(_val: &T) {} #(assert_impl_devicecopy(&#func_params);)* @@ -151,6 +152,7 @@ pub(in super::super) fn quote_cuda_wrapper( } fn specialise_ptx_func_inputs( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, FunctionInputs { func_inputs, @@ -172,16 +174,16 @@ fn specialise_ptx_func_inputs( ) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) }; let cuda_type = match cuda_mode { InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> }, InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceAccessible< - <#syn_type as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, }; @@ -198,11 +200,11 @@ fn specialise_ptx_func_inputs( if mutability.is_some() { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { @@ -211,7 +213,7 @@ fn specialise_ptx_func_inputs( }; quote::quote_spanned! { ty.span()=> - rust_cuda::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> } } else { cuda_type @@ -229,6 +231,7 @@ fn specialise_ptx_func_inputs( } fn specialise_ptx_unboxed_types( + crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, ) -> Vec { @@ -240,7 +243,7 @@ fn specialise_ptx_unboxed_types( let type_ident = quote::format_ident!("__T_{}", i); quote::quote_spanned! 
{ ty.span()=> - rust_cuda::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#args :: #type_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index c057fe7f1..76b88eee6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -38,7 +38,63 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }, }; - let func = parse_kernel_fn(func); + let mut func = parse_kernel_fn(func); + + let mut crate_path = None; + + func.attrs.retain(|attr| { + if attr.path.is_ident("kernel") { + if let Ok(syn::Meta::List(list)) = attr.parse_meta() { + for meta in &list.nested { + match meta { + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + + return false; + } + + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[kernel(crate)] attribute.", + ); + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[kernel(crate = \ + \"\")] attribute: {}.", + err + ), + }, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + ); + } + } + } + } else { + emit_error!( + attr.span(), + "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." 
+ ); + } + + false + } else { + true + } + }); + + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -177,6 +233,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( + &crate_path, &config, &decl_generics, &impl_generics, @@ -184,8 +241,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_ident, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&func_ident, &config); + let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); let cpu_linker_macro = quote_cpu_linker_macro( + &crate_path, &config, &decl_generics, &func_inputs, @@ -194,6 +252,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, ); let cuda_wrapper = quote_cuda_wrapper( + &crate_path, &config, &func_inputs, &func_ident, @@ -298,6 +357,7 @@ fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option< } fn quote_generic_check( + crate_path: &syn::Path, FuncIdent { func_ident_hash, .. }: &FuncIdent, @@ -313,11 +373,11 @@ fn quote_generic_check( quote::quote_spanned! 
{ func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::rust_cuda::safety::kernel_signature::Assert<{ - ::rust_cuda::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = ::rust_cuda::safety::kernel_signature::Assert::<{ - ::rust_cuda::safety::kernel_signature::check( - rust_cuda::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), + const _: #crate_path::safety::kernel_signature::Assert<{ + #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match + }> = #crate_path::safety::kernel_signature::Assert::<{ + #crate_path::safety::kernel_signature::check( + #crate_path::host::check_kernel!(#args #crate_name #crate_manifest_dir).as_bytes(), concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() ) }>; diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 61891aa8c..c6659e9c9 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -5,6 +5,7 @@ use super::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( + crate_path: &syn::Path, field: &syn::Field, field_index: usize, @@ -33,12 +34,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::common::DeviceAccessible::from( &self.#field_accessor, ); }); r2c_field_async_declarations.push(quote! { - let #field_repr_ident = rust_cuda::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::common::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -49,26 +50,26 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - rust_cuda::host::CombinedCudaAlloc< - <#field_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::host::CombinedCudaAlloc< + <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( &self.#field_accessor, alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, stream, @@ -80,13 +81,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::common::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( &mut self.#field_accessor, alloc_front, stream, @@ -95,30 +96,30 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! 
{ - rust_cuda::host::CombinedCudaAlloc< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaAllocation, + #crate_path::host::CombinedCudaAlloc< + <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = rust_cuda::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( < - #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -130,17 +131,17 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCuda::restore( + let alloc_front = #crate_path::common::RustToCuda::restore( < - #proxy_ty as rust_cuda::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = rust_cuda::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( < - #proxy_ty as rust_cuda::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, stream, @@ -149,8 +150,8 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - rust_cuda::common::RustToCudaProxy::<#field_ty>::into( - rust_cuda::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::common::RustToCudaProxy::<#field_ty>::into( + #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 8416d3c17..21509ef8c 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -12,7 +12,10 @@ pub enum CudaReprFieldTy { }, } -pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprFieldTy { +pub fn swap_field_type_and_filter_attrs( + crate_path: &syn::Path, + field: &mut syn::Field, +) -> CudaReprFieldTy { let mut cuda_repr_field_ty: Option = None; let mut field_ty = field.ty.clone(); @@ -33,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#field_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#field_ty as #crate_path::common::RustToCuda>::CudaRepresentation > }; } else { @@ -54,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - rust_cuda::common::DeviceAccessible< - <#proxy_ty as rust_cuda::common::RustToCuda>::CudaRepresentation + #crate_path::common::DeviceAccessible< + <#proxy_ty as #crate_path::common::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -104,8 +107,8 @@ pub fn swap_field_type_and_filter_attrs(field: &mut syn::Field) -> CudaReprField cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - rust_cuda::common::DeviceAccessible< - rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::common::DeviceAccessible< + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 646686534..b9335db46 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -10,6 +10,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( syn::Generics, Vec, bool, + syn::Path, ) { let mut type_params = ast .generics @@ -30,8 +31,8 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } let mut r2c_ignore = false; - let mut r2c_async_impl = None; + let mut crate_path = None; struct_attrs_cuda.retain(|attr| { if attr.path.is_ident("cuda") { @@ -104,6 +105,30 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( "[rust-cuda]: Duplicate #[cuda(async)] attribute.", ); }, + syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { + path, + lit: syn::Lit::Str(s), + .. + })) if path.is_ident("crate") => match syn::parse_str::(&s.value()) { + Ok(new_crate_path) => { + if crate_path.is_none() { + crate_path = Some( + syn::parse_quote_spanned! { s.span() => #new_crate_path }, + ); + } else { + emit_error!( + s.span(), + "[rust-cuda]: Duplicate #[cuda(crate)] attribute.", + ); + } + }, + Err(err) => emit_error!( + s.span(), + "[rust-cuda]: Invalid #[cuda(crate = \ + \"\")] attribute: {}.", + err + ), + }, syn::NestedMeta::Meta(syn::Meta::NameValue(syn::MetaNameValue { path: syn::Path { leading_colon: None, @@ -134,9 +159,10 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] \ - struct attribute." 
+ "[rust-cuda]: Expected #[cuda(ignore)] / \ + #[cuda(bound = \"\")] / \ + #[cuda(crate = \"\")] / \ + #[cuda(layout::ATTR = \"VALUE\")] struct attribute." ); }, } @@ -144,8 +170,10 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(bound = \ - \"\")] / #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(ignore)] / \ + #[cuda(bound = \"\")] / \ + #[cuda(crate = \"\")] / \ + #[cuda(layout::ATTR = \"VALUE\")] struct attribute." ); } @@ -155,18 +183,20 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } }); + let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + for ty in &type_params { struct_generics_cuda .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCuda + #ty: #crate_path::common::RustToCuda }); struct_generics_cuda_async .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: ::rust_cuda::common::RustToCudaAsync + #ty: #crate_path::common::RustToCudaAsync }); } @@ -176,5 +206,6 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( struct_generics_cuda_async, struct_layout_attrs, r2c_async_impl.unwrap_or(true), + crate_path, ) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index ff607af28..1ff844645 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -1,7 +1,9 @@ use proc_macro2::TokenStream; use quote::quote; +#[allow(clippy::too_many_arguments)] pub fn cuda_struct_declaration( + crate_path: &syn::Path, struct_attrs_cuda: &[syn::Attribute], struct_layout_attrs: &[syn::Attribute], struct_vis_cuda: &syn::Visibility, @@ -27,23 +29,27 @@ pub fn cuda_struct_declaration( quote!(#where_clause #struct_fields_cuda) }; + let const_type_layout_crate_path = quote! { #crate_path::const_type_layout }.to_string(); + quote! 
{ #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(rust_cuda::const_type_layout::TypeLayout)] + #[derive(#crate_path::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* + #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics rust_cuda::rustacuda_core::DeviceCopy + unsafe impl #impl_generics #crate_path::rustacuda_core::DeviceCopy for #struct_name_cuda #ty_generics #where_clause {} } } #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -70,7 +76,7 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! { - unsafe impl #impl_generics rust_cuda::common::RustToCuda for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::common::RustToCuda for #struct_name #ty_generics #where_clause { type CudaRepresentation = #struct_name_cuda #ty_generics; @@ -79,14 +85,14 @@ pub fn rust_to_cuda_trait( type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, - ) -> rust_cuda::rustacuda::error::CudaResult<( - rust_cuda::common::DeviceAccessible, - rust_cuda::host::CombinedCudaAlloc + ) -> #crate_path::rustacuda::error::CudaResult<( + #crate_path::common::DeviceAccessible, + #crate_path::host::CombinedCudaAlloc )> { - let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_front = #crate_path::host::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -94,18 +100,18 @@ pub fn rust_to_cuda_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - rust_cuda::common::DeviceAccessible::from(borrow), - 
rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::DeviceAccessible::from(borrow), + #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: rust_cuda::host::CombinedCudaAlloc< + alloc: #crate_path::host::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - ) -> rust_cuda::rustacuda::error::CudaResult { + ) -> #crate_path::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_destructors)* @@ -118,6 +124,7 @@ pub fn rust_to_cuda_trait( #[allow(clippy::too_many_arguments)] pub fn rust_to_cuda_async_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda_async: &syn::Generics, @@ -143,19 +150,19 @@ pub fn rust_to_cuda_async_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! { - unsafe impl #impl_generics rust_cuda::common::RustToCudaAsync for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::common::RustToCudaAsync for #struct_name #ty_generics #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, - stream: &rust_cuda::rustacuda::stream::Stream, - ) -> rust_cuda::rustacuda::error::CudaResult<( - rust_cuda::common::DeviceAccessible, - rust_cuda::host::CombinedCudaAlloc + stream: &#crate_path::rustacuda::stream::Stream, + ) -> #crate_path::rustacuda::error::CudaResult<( + #crate_path::common::DeviceAccessible, + #crate_path::host::CombinedCudaAlloc )> { - let alloc_front = rust_cuda::host::NullCudaAlloc; + let alloc_front = #crate_path::host::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -163,19 +170,19 @@ pub fn rust_to_cuda_async_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - rust_cuda::common::DeviceAccessible::from(borrow), - 
rust_cuda::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::DeviceAccessible::from(borrow), + #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: rust_cuda::host::CombinedCudaAlloc< + alloc: #crate_path::host::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - stream: &rust_cuda::rustacuda::stream::Stream, - ) -> rust_cuda::rustacuda::error::CudaResult { + stream: &#crate_path::rustacuda::stream::Stream, + ) -> #crate_path::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* @@ -187,6 +194,7 @@ pub fn rust_to_cuda_async_trait( } pub fn cuda_as_rust_trait( + crate_path: &syn::Path, struct_name: &syn::Ident, struct_name_cuda: &syn::Ident, struct_generics_cuda: &syn::Generics, @@ -210,14 +218,14 @@ pub fn cuda_as_rust_trait( let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics rust_cuda::common::CudaAsRust + unsafe impl #impl_generics #crate_path::common::CudaAsRust for #struct_name_cuda #ty_generics #where_clause { type RustRepresentation = #struct_name #ty_generics; #[cfg(target_os = "cuda")] unsafe fn as_rust( - this: &rust_cuda::common::DeviceAccessible, + this: &#crate_path::common::DeviceAccessible, ) -> #struct_name #ty_generics { #cuda_as_rust_struct_construction } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index dc8eb6491..4173d6658 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -21,8 +21,17 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let struct_name = &ast.ident; let struct_name_cuda = get_cuda_repr_ident(struct_name); + let ( + struct_attrs_cuda, + struct_generics_cuda, + struct_generics_cuda_async, + struct_layout_attrs, + r2c_async_impl, + crate_path, + ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); + let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - rust_cuda::host::NullCudaAlloc + #crate_path::host::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); @@ -45,9 +54,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let mut r2c_field_async_destructors_reverse: Vec = Vec::new(); for (field_index, field) in fields.iter_mut().enumerate() { - let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(field); + let cuda_repr_field_ty = + field_ty::swap_field_type_and_filter_attrs(&crate_path, field); combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type( + &crate_path, field, field_index, &cuda_repr_field_ty, @@ -69,15 +80,8 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { syn::Fields::Unit => (), } - let ( - struct_attrs_cuda, - struct_generics_cuda, - struct_generics_cuda_async, - struct_layout_attrs, - r2c_async_impl, - ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); - let cuda_struct_declaration = r#impl::cuda_struct_declaration( + &crate_path, &struct_attrs_cuda, &struct_layout_attrs, &ast.vis, @@ -88,6 +92,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ); let rust_to_cuda_trait_impl = r#impl::rust_to_cuda_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda, @@ -100,6 +105,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let rust_to_cuda_async_trait_impl = if r2c_async_impl { r#impl::rust_to_cuda_async_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda_async, @@ -113,6 +119,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { }; let cuda_as_rust_trait_impl = r#impl::cuda_as_rust_trait( + &crate_path, struct_name, &struct_name_cuda, &struct_generics_cuda, From 73bb289b6266c577a875d53f2e03e947cc7e4d45 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 30 Nov 2022 
11:28:51 -0800 Subject: [PATCH 015/120] Added simple thread-block shared memory support --- examples/single-source/src/main.rs | 11 +++++++++++ src/device/mod.rs | 29 +++++++++++++++++++++++++++++ src/device/utils.rs | 1 - src/lib.rs | 8 ++++++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 79f6e3ec1..891c2db06 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -47,6 +47,17 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( ) where ::CudaRepresentation: rc::safety::StackOnly, { + use rc::device::ThreadBlockShared; + + let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + + unsafe { + (*shared.get().cast::().add(1)).0 = 42; + } + unsafe { + (*shared2.get().cast::().add(2)).1 = 24; + } } #[cfg(not(target_os = "cuda"))] diff --git a/src/device/mod.rs b/src/device/mod.rs index 39ae0719f..583bd2a2e 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -113,3 +113,32 @@ impl DerefMut for ShallowCopy { &mut self.0 } } + +#[repr(transparent)] +pub struct ThreadBlockShared { + shared: *mut T, +} + +impl ThreadBlockShared { + #[must_use] + pub fn new_uninit() -> Self { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", + "mov.u64 {reg}, {reg}_rust_cuda_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } + } + + #[must_use] + pub fn get(&self) -> *mut T { + self.shared + } +} diff --git a/src/device/utils.rs b/src/device/utils.rs index a45ff9c71..897df29ea 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,5 +1,4 @@ use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free diff --git 
a/src/lib.rs b/src/lib.rs index 2c202ffee..795e00cfa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,14 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(stdsimd) )] +#![cfg_attr( + any(all(not(feature = "host"), target_os = "cuda"), doc), + feature(asm_experimental_arch) +)] +#![cfg_attr( + any(all(not(feature = "host"), target_os = "cuda"), doc), + feature(asm_const) +)] #![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] From a23e76e47fd79b9b1ab195d6a7e155a5316345b1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 30 Nov 2022 14:24:46 -0800 Subject: [PATCH 016/120] Fixed device utils doc tests --- src/device/utils.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/device/utils.rs b/src/device/utils.rs index 897df29ea..a45ff9c71 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,4 +1,5 @@ use alloc::alloc::{GlobalAlloc, Layout}; +#[cfg(target_os = "cuda")] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free From 9f330f4d6ca38c3461034cb5aab721103f3abeca Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 1 Dec 2022 03:39:34 -0800 Subject: [PATCH 017/120] Convert cuda thread-block-shared memory address to generic --- src/device/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/device/mod.rs b/src/device/mod.rs index 583bd2a2e..7c11cb34f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -127,7 +127,7 @@ impl ThreadBlockShared { unsafe { core::arch::asm!( ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", - "mov.u64 {reg}, {reg}_rust_cuda_shared;", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_shared;", reg = out(reg64) shared, align = const(core::mem::align_of::()), size = const(core::mem::size_of::()), From 8970c5b7a95a2e2bb6a4ad08052820e6842ffc42 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 05:47:38 -0800 Subject: [PATCH 018/120] First steps towards better shared 
memory, including dynamic --- .../generate/cpu_linker_macro/kernel_func.rs | 19 ++- .../kernel_func_async/async_func_types.rs | 9 ++ .../kernel_func_async/launch_types.rs | 11 ++ .../kernel/wrapper/generate/cpu_wrapper.rs | 20 +++ .../kernel/wrapper/generate/cuda_wrapper.rs | 32 ++++- .../src/kernel/wrapper/inputs/attribute.rs | 7 +- .../src/kernel/wrapper/inputs/mod.rs | 30 +++- rust-cuda-derive/src/kernel/wrapper/mod.rs | 11 +- src/device/alloc.rs | 16 +++ src/device/{utils.rs => macros.rs} | 105 -------------- src/device/mod.rs | 34 +---- src/device/thread.rs | 133 ++++++++++++++++++ src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/mod.rs | 1 + src/utils/shared/mod.rs | 35 +++++ src/utils/shared/slice.rs | 73 ++++++++++ src/utils/shared/static.rs | 44 ++++++ 18 files changed, 441 insertions(+), 147 deletions(-) create mode 100644 src/device/alloc.rs rename src/device/{utils.rs => macros.rs} (59%) create mode 100644 src/device/thread.rs create mode 100644 src/utils/shared/mod.rs create mode 100644 src/utils/shared/slice.rs create mode 100644 src/utils/shared/static.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index d6e70e276..00208e57e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -13,7 +13,10 @@ pub(super) fn quote_kernel_func( generic_wrapper_where_clause, .. }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, + inputs @ FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, fn_ident @ FuncIdent { func_ident, .. 
}: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], @@ -21,8 +24,9 @@ pub(super) fn quote_kernel_func( ) -> TokenStream { let new_func_inputs = func_inputs .iter() + .zip(func_input_cuda_types.iter()) .enumerate() - .map(|(i, arg)| match arg { + .map(|(i, (arg, (cuda_type, _)))| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -46,6 +50,16 @@ pub(super) fn quote_kernel_func( quote! { #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type } + } else if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(_) = &**ty { + quote! { #(#attrs)* #pat #colon_token + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote! { #(#attrs)* #pat #colon_token + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } } else { quote! { #(#attrs)* #pat #colon_token #syn_type } } @@ -171,6 +185,7 @@ fn generate_raw_func_input_wrap( ) } } }, + InputCudaType::ThreadBlockShared => inner, }, syn::FnArg::Receiver(_) => unreachable!(), }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index c24406c9a..8cbbc7790 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -46,6 +46,15 @@ pub(super) fn generate_async_func_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { + quote! { + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote! 
{ + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index 16cd0008e..cda2d7e4a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -47,6 +47,17 @@ pub(in super::super) fn generate_launch_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => { + if let syn::Type::Slice(_) = &**ty { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } + }, }; cpu_func_types_launch.push( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index 4851af9ce..6b15f2109 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -91,6 +91,7 @@ pub(in super::super) fn quote_cpu_wrapper( } } +#[allow(clippy::too_many_lines)] fn generate_new_func_inputs_decl( crate_path: &syn::Path, KernelConfig { args, .. 
}: &KernelConfig, @@ -132,6 +133,16 @@ fn generate_new_func_inputs_decl( mutability: *mutability, elem: syn_type, })) + } else if matches!(cuda_mode, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(_) = &**ty { + syn::parse_quote!( + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + ) + } else { + syn::parse_quote!( + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + ) + } } else { syn_type } @@ -155,6 +166,15 @@ fn generate_new_func_inputs_decl( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { + syn::parse_quote!( + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + ) + } else { + syn::parse_quote!( + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + ) + }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 36e316708..34db62123 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -33,6 +33,8 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); + let mut shared_slice = Vec::new(); + let ptx_func_input_unwrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() @@ -90,7 +92,24 @@ pub(in super::super) fn quote_cuda_wrapper( #pat, |#pat: #syn_type| { #inner }, ) } - } + }, + InputCudaType::ThreadBlockShared => if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**ty { + shared_slice.push(elem); + + quote! { + #ptx_jit_load; + #crate_path::utils::shared::slice::ThreadBlockSharedSlice::with_uninit( + #pat, |#pat: #syn_type| { #inner }, + ) + } + } else { + quote! 
{ + #ptx_jit_load; + #crate_path::utils::shared::r#static::ThreadBlockShared::with_uninit( + #pat, |#pat: #syn_type| { #inner }, + ) + } + }, } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -186,6 +205,17 @@ fn specialise_ptx_func_inputs( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, + InputCudaType::ThreadBlockShared => { + if let syn::Type::Slice(_) = &**ty { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> + } + } + }, }; let ty = if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index ceeee1e3e..6b479a664 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -19,10 +19,11 @@ impl syn::parse::Parse for KernelInputAttribute { let cuda_type = match &*mode.to_string() { "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, "LendRustToCuda" => InputCudaType::LendRustToCuda, + "ThreadBlockShared" => InputCudaType::ThreadBlockShared, _ => abort!( mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", + "Unexpected CUDA transfer mode `{}`: Expected `SafeDeviceCopy`, \ + `LendRustToCuda`, or `ThreadBlockShared`.", mode ), }; @@ -61,7 +62,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{}`: Expected `pass` or `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index f3cc1a4d8..fb010f76c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ 
b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -12,6 +12,7 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } +#[allow(clippy::too_many_lines)] pub(super) fn parse_function_inputs( func: &syn::ItemFn, generic_params: &mut syn::punctuated::Punctuated, @@ -53,9 +54,25 @@ pub(super) fn parse_function_inputs( for attr in attrs { match attr { - KernelInputAttribute::PassType(_span, pass_type) + KernelInputAttribute::PassType(span, pass_type) if cuda_type.is_none() => { + if matches!(pass_type, InputCudaType::ThreadBlockShared) + && !matches!( + &**ty, + syn::Type::Ptr(syn::TypePtr { + mutability: Some(_), + .. + }) + ) + { + abort!( + span, + "Only mutable pointer types can be shared in a \ + thread block." + ); + } + cuda_type = Some(pass_type); }, KernelInputAttribute::PassType(span, _pass_type) => { @@ -208,6 +225,17 @@ fn ensure_reference_type_lifetime( elem, })) }, + ty @ syn::Type::Ptr(syn::TypePtr { elem, .. }) => { + if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::Type::Slice(syn::TypeSlice { elem, .. 
}) = &**elem { + elem.clone() + } else { + elem.clone() + } + } else { + Box::new(ty.clone()) + } + }, ty => { if matches!(cuda_type, InputCudaType::LendRustToCuda) { generic_params.insert( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 76b88eee6..744a0f8d8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -205,7 +205,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .map(|(arg, ident)| match arg { + .zip(&func_inputs.func_input_cuda_types) + .zip(&func.sig.inputs) + .map(|(((arg, ident), (cuda_type, _)), arg_orig)| match arg { syn::FnArg::Typed(syn::PatType { attrs, colon_token, @@ -225,6 +227,12 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ty: ty.clone(), }); + if matches!(cuda_type, InputCudaType::ThreadBlockShared) { + if let syn::FnArg::Typed(syn::PatType { ty: ty_orig, .. 
}) = arg_orig { + *ty = ty_orig.clone(); + } + } + std::mem::replace(arg, ident_fn_arg) }, syn::FnArg::Receiver(_) => unreachable!(), @@ -284,6 +292,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { enum InputCudaType { SafeDeviceCopy, LendRustToCuda, + ThreadBlockShared, } struct InputPtxJit(bool); diff --git a/src/device/alloc.rs b/src/device/alloc.rs new file mode 100644 index 000000000..14a294814 --- /dev/null +++ b/src/device/alloc.rs @@ -0,0 +1,16 @@ +use alloc::alloc::{GlobalAlloc, Layout}; +#[cfg(target_os = "cuda")] +use core::arch::nvptx; + +/// Memory allocator using CUDA malloc/free +pub struct PTXAllocator; + +unsafe impl GlobalAlloc for PTXAllocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + nvptx::malloc(layout.size()).cast() + } + + unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { + nvptx::free(ptr.cast()); + } +} diff --git a/src/device/utils.rs b/src/device/macros.rs similarity index 59% rename from src/device/utils.rs rename to src/device/macros.rs index a45ff9c71..932ca75ae 100644 --- a/src/device/utils.rs +++ b/src/device/macros.rs @@ -1,20 +1,3 @@ -use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] -use core::arch::nvptx; - -/// Memory allocator using CUDA malloc/free -pub struct PTXAllocator; - -unsafe impl GlobalAlloc for PTXAllocator { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - nvptx::malloc(layout.size()).cast() - } - - unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { - nvptx::free(ptr.cast()); - } -} - // Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs #[macro_export] #[doc(hidden)] @@ -130,91 +113,3 @@ macro_rules! 
assert_ne { } }; } - -/// Dimension specified in kernel launching -#[derive(Debug)] -pub struct Dim3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -/// Indices that the kernel code is running on -#[derive(Debug)] -pub struct Idx3 { - pub x: u32, - pub y: u32, - pub z: u32, -} - -#[must_use] -pub fn block_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Dim3 { - x: nvptx::_block_dim_x() as u32, - y: nvptx::_block_dim_y() as u32, - z: nvptx::_block_dim_z() as u32, - } - } -} - -#[must_use] -pub fn block_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_block_idx_x() as u32, - y: nvptx::_block_idx_y() as u32, - z: nvptx::_block_idx_z() as u32, - } - } -} - -#[must_use] -pub fn grid_dim() -> Dim3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Dim3 { - x: nvptx::_grid_dim_x() as u32, - y: nvptx::_grid_dim_y() as u32, - z: nvptx::_grid_dim_z() as u32, - } - } -} - -#[must_use] -pub fn thread_idx() -> Idx3 { - #[allow(clippy::cast_sign_loss)] - unsafe { - Idx3 { - x: nvptx::_thread_idx_x() as u32, - y: nvptx::_thread_idx_y() as u32, - z: nvptx::_thread_idx_z() as u32, - } - } -} - -impl Dim3 { - #[must_use] - pub fn size(&self) -> usize { - (self.x as usize) * (self.y as usize) * (self.z as usize) - } -} - -impl Idx3 { - #[must_use] - pub fn as_id(&self, dim: &Dim3) -> usize { - (self.x as usize) - + (self.y as usize) * (dim.x as usize) - + (self.z as usize) * (dim.x as usize) * (dim.y as usize) - } -} - -#[must_use] -pub fn index() -> usize { - let block_id = block_idx().as_id(&grid_dim()); - let thread_id = thread_idx().as_id(&block_dim()); - - block_id * block_dim().size() + thread_id -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 7c11cb34f..45c833923 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -12,7 +12,10 @@ use crate::{ safety::SafeDeviceCopy, }; -pub mod utils; +pub mod alloc; +pub mod thread; + +mod macros; pub trait BorrowFromRust: RustToCuda { /// # Safety @@ -113,32 +116,3 @@ impl 
DerefMut for ShallowCopy { &mut self.0 } } - -#[repr(transparent)] -pub struct ThreadBlockShared { - shared: *mut T, -} - -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); - } - - Self { shared } - } - - #[must_use] - pub fn get(&self) -> *mut T { - self.shared - } -} diff --git a/src/device/thread.rs b/src/device/thread.rs new file mode 100644 index 000000000..8f3bc5719 --- /dev/null +++ b/src/device/thread.rs @@ -0,0 +1,133 @@ +#[cfg(target_os = "cuda")] +use core::arch::nvptx; + +#[allow(clippy::module_name_repetitions)] +pub struct Thread { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlock { + _private: (), +} + +#[allow(clippy::module_name_repetitions)] +pub struct ThreadBlockGrid { + _private: (), +} + +impl Thread { + #[must_use] + pub fn this() -> Self { + Self { _private: () } + } + + #[must_use] + pub fn index(&self) -> usize { + let block = self.block(); + let grid = block.grid(); + + let block_id = block.idx().as_id(&grid.dim()); + let thread_id = self.idx().as_id(&block.dim()); + + block_id * block.dim().size() + thread_id + } + + #[must_use] + pub fn idx(&self) -> Idx3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_thread_idx_x() as u32, + y: nvptx::_thread_idx_y() as u32, + z: nvptx::_thread_idx_z() as u32, + } + } + } + + #[must_use] + pub fn block(&self) -> ThreadBlock { + ThreadBlock { _private: () } + } +} + +impl ThreadBlock { + #[must_use] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_block_dim_x() as u32, + y: nvptx::_block_dim_y() as u32, + z: nvptx::_block_dim_z() as u32, + } + } + } + + #[must_use] + pub fn idx(&self) -> Idx3 { 
+ #[allow(clippy::cast_sign_loss)] + unsafe { + Idx3 { + x: nvptx::_block_idx_x() as u32, + y: nvptx::_block_idx_y() as u32, + z: nvptx::_block_idx_z() as u32, + } + } + } + + #[must_use] + pub fn grid(&self) -> ThreadBlockGrid { + ThreadBlockGrid { _private: () } + } + + pub fn synchronize(&self) { + unsafe { nvptx::_syncthreads() } + } +} + +impl ThreadBlockGrid { + #[must_use] + pub fn dim(&self) -> Dim3 { + #[allow(clippy::cast_sign_loss)] + unsafe { + Dim3 { + x: nvptx::_grid_dim_x() as u32, + y: nvptx::_grid_dim_y() as u32, + z: nvptx::_grid_dim_z() as u32, + } + } + } +} + +/// Dimension specified in kernel launching +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Dim3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +/// Indices that the kernel code is running on +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Idx3 { + pub x: u32, + pub y: u32, + pub z: u32, +} + +impl Dim3 { + #[must_use] + pub fn size(&self) -> usize { + (self.x as usize) * (self.y as usize) * (self.z as usize) + } +} + +impl Idx3 { + #[must_use] + pub fn as_id(&self, dim: &Dim3) -> usize { + (self.x as usize) + + (self.y as usize) * (dim.x as usize) + + (self.z as usize) * (dim.x as usize) * (dim.y as usize) + } +} diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index a60a94eb9..ea5f1bba4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -28,7 +28,7 @@ unsafe impl DeviceCopy #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_const_stride(slice: &[E]) -> &[E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } @@ -36,7 +36,7 @@ fn split_slice_const_stride(slice: &[E]) -> &[E] { #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn 
split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { - let offset: usize = crate::device::utils::index() * STRIDE; + let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 668112f88..c2ad169ff 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -28,7 +28,7 @@ unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } @@ -36,7 +36,7 @@ fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { - let offset: usize = crate::device::utils::index() * stride; + let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 303e96262..c70432f31 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -4,6 +4,7 @@ pub mod aliasing; pub mod alloc; pub mod device_copy; pub mod exchange; +pub mod shared; mod r#box; mod boxed_slice; diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs new file mode 100644 index 000000000..8b49ca6d3 --- /dev/null +++ b/src/utils/shared/mod.rs @@ -0,0 +1,35 @@ +pub mod slice; +pub mod r#static; + 
+#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +#[allow(clippy::module_name_repetitions)] +pub trait ThreadBlockShared: 'static + Sized { + fn share_uninit() -> r#static::ThreadBlockShared; +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockShared for T { + fn share_uninit() -> r#static::ThreadBlockShared { + r#static::ThreadBlockShared::uninit() + } +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +pub trait ThreadBlockSharedSlice: 'static { + type Elem: Sized; + + fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice; +} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice for [T] { + type Elem = T; + + fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice { + slice::ThreadBlockSharedSlice::with_len(len) + } +} diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs new file mode 100644 index 000000000..098670fba --- /dev/null +++ b/src/utils/shared/slice.rs @@ -0,0 +1,73 @@ +use rustacuda_core::DeviceCopy; + +#[allow(clippy::module_name_repetitions)] +#[derive(TypeLayout)] +#[repr(C)] +pub struct ThreadBlockSharedSlice { + len: usize, + byte_offset: usize, + marker: [T; 0], +} + +unsafe impl DeviceCopy for ThreadBlockSharedSlice {} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice { + #[must_use] + pub fn with_len(len: usize) -> Self { + Self { + len, + byte_offset: 0, + marker: [], + } + } + + #[must_use] + pub fn len(&self) -> usize { + self.len + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len == 0 + } +} 
+ +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +impl ThreadBlockSharedSlice { + /// # Safety + /// + /// The thread-block shared dynamic memory must be initialised once and + /// only once per kernel. + pub unsafe fn init() { + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", + align = const(core::mem::align_of::()), + ); + } + } + + /// # Safety + /// + /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one + /// call to [`ThreadBlockSharedSlice::init`] for the type `T` amongst + /// all `ThreadBlockSharedSlice` that has the largest alignment. + pub unsafe fn with_uninit Q, Q>(self, inner: F) -> Q { + let base: *mut u8; + + unsafe { + core::arch::asm!( + "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", + reg = out(reg64) base, + ); + } + + let slice = + core::ptr::slice_from_raw_parts_mut(base.add(self.byte_offset).cast(), self.len); + + inner(slice) + } +} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs new file mode 100644 index 000000000..53f8aeb9e --- /dev/null +++ b/src/utils/shared/static.rs @@ -0,0 +1,44 @@ +use rustacuda_core::DeviceCopy; + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct ThreadBlockShared { + marker: [T; 0], +} + +unsafe impl DeviceCopy for ThreadBlockShared {} + +#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockShared { + #[must_use] + pub fn uninit() -> Self { + Self { marker: [] } + } +} + +#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] +#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +impl ThreadBlockShared { + #[must_use] + pub fn new_uninit() -> *mut T { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", + 
reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + shared + } + + #[must_use] + pub fn with_uninit Q, Q>(self, inner: F) -> Q { + inner(Self::new_uninit()) + } +} From 79792bd56602f30591a535edcee9536668322211 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 13:24:23 -0800 Subject: [PATCH 019/120] Revert derive changes + R2C-based approach start --- examples/single-source/src/main.rs | 16 ++-- .../generate/cpu_linker_macro/kernel_func.rs | 19 +---- .../kernel_func_async/async_func_types.rs | 9 -- .../kernel_func_async/launch_types.rs | 11 --- .../kernel/wrapper/generate/cpu_wrapper.rs | 20 ----- .../kernel/wrapper/generate/cuda_wrapper.rs | 32 +------ .../src/kernel/wrapper/inputs/attribute.rs | 7 +- .../src/kernel/wrapper/inputs/mod.rs | 30 +------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 11 +-- src/safety/stack_only.rs | 4 + src/utils/shared/mod.rs | 35 +------- src/utils/shared/static.rs | 84 +++++++++++++++++-- 12 files changed, 99 insertions(+), 179 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 891c2db06..2e1c9e199 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,8 @@ extern crate alloc; +use rc::utils::shared::r#static::ThreadBlockShared; + #[cfg(not(target_os = "cuda"))] fn main() {} @@ -44,19 +46,21 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, + #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { - use rc::device::ThreadBlockShared; - let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); unsafe { - 
(*shared.get().cast::().add(1)).0 = 42; + (*shared.as_mut_ptr().cast::().add(1)).0 = 42; + } + unsafe { + (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; } unsafe { - (*shared2.get().cast::().add(2)).1 = 24; + *shared3.as_mut_ptr() = 12; } } @@ -84,10 +88,10 @@ mod host { mod cuda_prelude { use core::arch::nvptx; - use rc::device::utils; + use rc::device::alloc::PTXAllocator; #[global_allocator] - static _GLOBAL_ALLOCATOR: utils::PTXAllocator = utils::PTXAllocator; + static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator; #[panic_handler] fn panic(_: &::core::panic::PanicInfo) -> ! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs index 00208e57e..d6e70e276 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs @@ -13,10 +13,7 @@ pub(super) fn quote_kernel_func( generic_wrapper_where_clause, .. }: &DeclGenerics, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], @@ -24,9 +21,8 @@ pub(super) fn quote_kernel_func( ) -> TokenStream { let new_func_inputs = func_inputs .iter() - .zip(func_input_cuda_types.iter()) .enumerate() - .map(|(i, (arg, (cuda_type, _)))| match arg { + .map(|(i, arg)| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -50,16 +46,6 @@ pub(super) fn quote_kernel_func( quote! { #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type } - } else if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(_) = &**ty { - quote! { #(#attrs)* #pat #colon_token - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote! 
{ #(#attrs)* #pat #colon_token - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } } else { quote! { #(#attrs)* #pat #colon_token #syn_type } } @@ -185,7 +171,6 @@ fn generate_raw_func_input_wrap( ) } } }, - InputCudaType::ThreadBlockShared => inner, }, syn::FnArg::Receiver(_) => unreachable!(), }, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs index 8cbbc7790..c24406c9a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs @@ -46,15 +46,6 @@ pub(super) fn generate_async_func_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { - quote! { - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote! { - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs index cda2d7e4a..16cd0008e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs @@ -47,17 +47,6 @@ pub(in super::super) fn generate_launch_types( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => { - if let syn::Type::Slice(_) = &**ty { - quote::quote_spanned! 
{ ty.span()=> - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } - }, }; cpu_func_types_launch.push( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs index 6b15f2109..4851af9ce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs @@ -91,7 +91,6 @@ pub(in super::super) fn quote_cpu_wrapper( } } -#[allow(clippy::too_many_lines)] fn generate_new_func_inputs_decl( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, @@ -133,16 +132,6 @@ fn generate_new_func_inputs_decl( mutability: *mutability, elem: syn_type, })) - } else if matches!(cuda_mode, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(_) = &**ty { - syn::parse_quote!( - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - ) - } else { - syn::parse_quote!( - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - ) - } } else { syn_type } @@ -166,15 +155,6 @@ fn generate_new_func_inputs_decl( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > ), - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(_) = &**ty { - syn::parse_quote!( - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - ) - } else { - syn::parse_quote!( - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - ) - }, }; if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 34db62123..36e316708 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -33,8 +33,6 @@ pub(in super::super) fn 
quote_cuda_wrapper( }) .collect::>(); - let mut shared_slice = Vec::new(); - let ptx_func_input_unwrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() @@ -92,24 +90,7 @@ pub(in super::super) fn quote_cuda_wrapper( #pat, |#pat: #syn_type| { #inner }, ) } - }, - InputCudaType::ThreadBlockShared => if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**ty { - shared_slice.push(elem); - - quote! { - #ptx_jit_load; - #crate_path::utils::shared::slice::ThreadBlockSharedSlice::with_uninit( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } else { - quote! { - #ptx_jit_load; - #crate_path::utils::shared::r#static::ThreadBlockShared::with_uninit( - #pat, |#pat: #syn_type| { #inner }, - ) - } - }, + } } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -205,17 +186,6 @@ fn specialise_ptx_func_inputs( <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation > }, - InputCudaType::ThreadBlockShared => { - if let syn::Type::Slice(_) = &**ty { - quote::quote_spanned! { ty.span()=> - #crate_path::utils::shared::slice::ThreadBlockSharedSlice<#syn_type> - } - } else { - quote::quote_spanned! 
{ ty.span()=> - #crate_path::utils::shared::r#static::ThreadBlockShared<#syn_type> - } - } - }, }; let ty = if let syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index 6b479a664..ceeee1e3e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -19,11 +19,10 @@ impl syn::parse::Parse for KernelInputAttribute { let cuda_type = match &*mode.to_string() { "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, "LendRustToCuda" => InputCudaType::LendRustToCuda, - "ThreadBlockShared" => InputCudaType::ThreadBlockShared, _ => abort!( mode.span(), - "Unexpected CUDA transfer mode `{}`: Expected `SafeDeviceCopy`, \ - `LendRustToCuda`, or `ThreadBlockShared`.", + "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ + `LendRustToCuda`.", mode ), }; @@ -62,7 +61,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index fb010f76c..f3cc1a4d8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -12,7 +12,6 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } -#[allow(clippy::too_many_lines)] pub(super) fn parse_function_inputs( func: &syn::ItemFn, generic_params: &mut syn::punctuated::Punctuated, @@ -54,25 +53,9 @@ pub(super) fn parse_function_inputs( for attr in attrs { match attr { - KernelInputAttribute::PassType(span, pass_type) + KernelInputAttribute::PassType(_span, pass_type) if cuda_type.is_none() => { - if matches!(pass_type, 
InputCudaType::ThreadBlockShared) - && !matches!( - &**ty, - syn::Type::Ptr(syn::TypePtr { - mutability: Some(_), - .. - }) - ) - { - abort!( - span, - "Only mutable pointer types can be shared in a \ - thread block." - ); - } - cuda_type = Some(pass_type); }, KernelInputAttribute::PassType(span, _pass_type) => { @@ -225,17 +208,6 @@ fn ensure_reference_type_lifetime( elem, })) }, - ty @ syn::Type::Ptr(syn::TypePtr { elem, .. }) => { - if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::Type::Slice(syn::TypeSlice { elem, .. }) = &**elem { - elem.clone() - } else { - elem.clone() - } - } else { - Box::new(ty.clone()) - } - }, ty => { if matches!(cuda_type, InputCudaType::LendRustToCuda) { generic_params.insert( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 744a0f8d8..76b88eee6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -205,9 +205,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .zip(&func_inputs.func_input_cuda_types) - .zip(&func.sig.inputs) - .map(|(((arg, ident), (cuda_type, _)), arg_orig)| match arg { + .map(|(arg, ident)| match arg { syn::FnArg::Typed(syn::PatType { attrs, colon_token, @@ -227,12 +225,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ty: ty.clone(), }); - if matches!(cuda_type, InputCudaType::ThreadBlockShared) { - if let syn::FnArg::Typed(syn::PatType { ty: ty_orig, .. 
}) = arg_orig { - *ty = ty_orig.clone(); - } - } - std::mem::replace(arg, ident_fn_arg) }, syn::FnArg::Receiver(_) => unreachable!(), @@ -292,7 +284,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { enum InputCudaType { SafeDeviceCopy, LendRustToCuda, - ThreadBlockShared, } struct InputPtxJit(bool); diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index e96f48993..ce8887bb3 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -36,5 +36,9 @@ mod sealed { impl !StackOnly for &T {} impl !StackOnly for &mut T {} + impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} + // impl !StackOnly for + // crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index 8b49ca6d3..dcfe3b008 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,35 +1,2 @@ -pub mod slice; +// pub mod slice; pub mod r#static; - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -#[allow(clippy::module_name_repetitions)] -pub trait ThreadBlockShared: 'static + Sized { - fn share_uninit() -> r#static::ThreadBlockShared; -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockShared for T { - fn share_uninit() -> r#static::ThreadBlockShared { - r#static::ThreadBlockShared::uninit() - } -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -pub trait ThreadBlockSharedSlice: 'static { - type Elem: Sized; - - fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice; -} - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl 
ThreadBlockSharedSlice for [T] { - type Elem = T; - - fn share_uninit(len: usize) -> slice::ThreadBlockSharedSlice { - slice::ThreadBlockSharedSlice::with_len(len) - } -} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 53f8aeb9e..fc3e86b3a 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,19 +1,87 @@ +#[cfg(not(target_os = "cuda"))] +use core::marker::PhantomData; + +use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; -#[derive(TypeLayout)] +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; + +#[cfg(not(target_os = "cuda"))] +#[repr(transparent)] +pub struct ThreadBlockShared { + marker: PhantomData, +} + +#[cfg(target_os = "cuda")] #[repr(transparent)] pub struct ThreadBlockShared { + shared: *mut T, +} + +#[doc(hidden)] +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct ThreadBlockSharedCudaRepresentation { + // Note: uses a zero-element array instead of PhantomData here so that + // TypeLayout can still observe T's layout marker: [T; 0], } -unsafe impl DeviceCopy for ThreadBlockShared {} +unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} + +unsafe impl RustToCuda for ThreadBlockShared { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + type CudaAllocation = crate::host::NullCudaAlloc; + type CudaRepresentation = ThreadBlockSharedCudaRepresentation; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + Ok(( + DeviceAccessible::from(ThreadBlockSharedCudaRepresentation { marker: [] }), + crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), + )) + } + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_null, alloc): (crate::host::NullCudaAlloc, A) = 
alloc.split(); + + Ok(alloc) + } +} + +unsafe impl CudaAsRust + for ThreadBlockSharedCudaRepresentation +{ + type RustRepresentation = ThreadBlockShared; + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { + ThreadBlockShared::new_uninit() + } +} #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] impl ThreadBlockShared { #[must_use] - pub fn uninit() -> Self { - Self { marker: [] } + pub fn new_uninit() -> Self { + Self { + marker: PhantomData::, + } } } @@ -21,7 +89,7 @@ impl ThreadBlockShared { #[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] impl ThreadBlockShared { #[must_use] - pub fn new_uninit() -> *mut T { + pub fn new_uninit() -> Self { let shared: *mut T; unsafe { @@ -34,11 +102,11 @@ impl ThreadBlockShared { ); } - shared + Self { shared } } #[must_use] - pub fn with_uninit Q, Q>(self, inner: F) -> Q { - inner(Self::new_uninit()) + pub fn as_mut_ptr(&self) -> *mut T { + self.shared } } From 914dd90f9628a13c7a942a05b333bee65b9b852e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 14:13:25 -0800 Subject: [PATCH 020/120] Some progress on shared slices --- src/lib.rs | 1 + src/safety/stack_only.rs | 6 +- src/utils/shared/mod.rs | 2 +- src/utils/shared/slice.rs | 160 +++++++++++++++++++++++++++---------- src/utils/shared/static.rs | 72 ++++++++--------- 5 files changed, 160 insertions(+), 81 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 795e00cfa..0c149d40f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(asm_const) )] +#![cfg_attr(target_os = "cuda", feature(ptr_metadata))] #![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 
ce8887bb3..eb3a69706 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -37,8 +37,10 @@ mod sealed { impl !StackOnly for &mut T {} impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - // impl !StackOnly for - // crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl !StackOnly + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index dcfe3b008..88a586ad6 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,2 +1,2 @@ -// pub mod slice; +pub mod slice; pub mod r#static; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 098670fba..238b1aac8 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,73 +1,151 @@ +#[cfg(not(target_os = "cuda"))] +use core::marker::PhantomData; + +use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; + +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + len: usize, + marker: PhantomData, +} + +#[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + shared: *mut [T], +} + +#[doc(hidden)] #[derive(TypeLayout)] +#[layout(bound = "T: 'static + ~const TypeGraphLayout")] #[repr(C)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSliceCudaRepresentation { len: usize, - byte_offset: usize, + // Note: uses a zero-element array instead of PhantomData here so that + // TypeLayout can still observe T's layout marker: [T; 0], } -unsafe impl DeviceCopy for ThreadBlockSharedSlice {} +unsafe impl DeviceCopy + for ThreadBlockSharedSliceCudaRepresentation +{ +} -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = 
"host"), target_os = "cuda"))))] -impl ThreadBlockSharedSlice { +// #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +// #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice { + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] #[must_use] - pub fn with_len(len: usize) -> Self { + pub fn new_uninit_with_len(len: usize) -> Self { Self { len, - byte_offset: 0, - marker: [], + marker: PhantomData::, } } + #[cfg(not(target_os = "cuda"))] #[must_use] pub fn len(&self) -> usize { self.len } + #[cfg(target_os = "cuda")] + #[must_use] + pub fn len(&self) -> usize { + core::ptr::metadata(self.shared) + } + #[must_use] pub fn is_empty(&self) -> bool { - self.len == 0 + self.len() == 0 + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_slice_ptr(&self) -> *mut [T] { + self.shared + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared.cast() } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockSharedSlice { - /// # Safety - /// - /// The thread-block shared dynamic memory must be initialised once and - /// only once per kernel. 
- pub unsafe fn init() { - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", - align = const(core::mem::align_of::()), - ); - } +unsafe impl RustToCuda for ThreadBlockSharedSlice { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + type CudaAllocation = crate::host::NullCudaAlloc; + type CudaRepresentation = ThreadBlockSharedSliceCudaRepresentation; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + Ok(( + DeviceAccessible::from(ThreadBlockSharedSliceCudaRepresentation { + len: self.len, + marker: [], + }), + crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), + )) } - /// # Safety - /// - /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one - /// call to [`ThreadBlockSharedSlice::init`] for the type `T` amongst - /// all `ThreadBlockSharedSlice` that has the largest alignment. 
- pub unsafe fn with_uninit Q, Q>(self, inner: F) -> Q { - let base: *mut u8; - - unsafe { - core::arch::asm!( - "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", - reg = out(reg64) base, - ); - } + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); + + Ok(alloc) + } +} + +unsafe impl CudaAsRust + for ThreadBlockSharedSliceCudaRepresentation +{ + type RustRepresentation = ThreadBlockSharedSlice; + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { + todo!() + + // unsafe { + // core::arch::asm!( + // ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", + // align = const(core::mem::align_of::()), + // ); + // } + + // let base: *mut u8; - let slice = - core::ptr::slice_from_raw_parts_mut(base.add(self.byte_offset).cast(), self.len); + // unsafe { + // core::arch::asm!( + // "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", + // reg = out(reg64) base, + // ); + // } - inner(slice) + // let slice = core::ptr::slice_from_raw_parts_mut( + // base.add(self.byte_offset).cast(), self.len, + // ); } } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index fc3e86b3a..b93e24523 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -29,6 +29,41 @@ pub struct ThreadBlockSharedCudaRepresentation { unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} +impl ThreadBlockShared { + #[cfg(not(target_os = "cuda"))] + #[must_use] + pub fn new_uninit() -> Self { + Self { + marker: PhantomData::, + } + } + + #[cfg(target_os = "cuda")] + #[must_use] + pub fn new_uninit() -> Self { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, 
{reg}_rust_cuda_static_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared + } +} + unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] @@ -73,40 +108,3 @@ unsafe impl CudaAsRust ThreadBlockShared::new_uninit() } } - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - Self { - marker: PhantomData::, - } - } -} - -#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); - } - - Self { shared } - } - - #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { - self.shared - } -} From b0826d7ffa454414e863ad946a85d9aeb96fa440 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 8 Jan 2023 14:31:06 +0000 Subject: [PATCH 021/120] Backup of progress on compile-time PTX checking --- examples/single-source/src/main.rs | 15 ++- rust-cuda-derive/Cargo.toml | 2 + rust-cuda-derive/build.rs | 3 + rust-cuda-derive/src/kernel/link/config.rs | 3 + rust-cuda-derive/src/kernel/link/mod.rs | 122 +++++++++++++++++- .../generate/cpu_linker_macro/get_ptx_str.rs | 2 +- src/safety/device_copy.rs | 7 + src/safety/no_aliasing.rs | 6 + 8 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 
rust-cuda-derive/build.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 2e1c9e199..c3b83d5ec 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ extern crate alloc; +#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -45,23 +46,25 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, - #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, - #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, + // #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.as_mut_ptr().cast::().add(1)).0 = 42; + (*shared.as_mut_ptr().cast::().add(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; } - unsafe { - *shared3.as_mut_ptr() = 12; - } + unsafe { core::arch::asm!("hi") } + // unsafe { + // *shared3.as_mut_ptr() = 12; + // } } #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 4b8677df4..788a08716 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -24,3 +25,4 @@ colored = "2.0" seahash = "4.1" ptx-builder = { git 
= "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } +ptx_compiler = "0.1" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-derive/build.rs new file mode 100644 index 000000000..27d940ad2 --- /dev/null +++ b/rust-cuda-derive/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rustc-link-lib=nvptxcompiler_static"); +} diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index cdfd0b575..bb5f011d6 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, + pub(super) kernel_hash: syn::Ident, pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, @@ -12,6 +13,7 @@ pub(super) struct LinkKernelConfig { impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; + let kernel_hash: syn::Ident = input.parse()?; let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -37,6 +39,7 @@ impl syn::parse::Parse for LinkKernelConfig { Ok(Self { kernel, + kernel_hash, args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 506d8ea03..1b116435c 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,7 +1,12 @@ use std::{ - env, fs, + env, + ffi::CString, + fs, io::{Read, Write}, + mem::MaybeUninit, + os::raw::c_int, path::{Path, PathBuf}, + ptr::addr_of_mut, sync::atomic::{AtomicBool, Ordering}, }; @@ -11,6 +16,7 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; +use ptx_compiler::sys::size_t; use 
super::utils::skip_kernel_compilation; @@ -56,6 +62,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let LinkKernelConfig { kernel, + kernel_hash, args, crate_name, crate_path, @@ -199,6 +206,119 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } + let mut compiler = MaybeUninit::uninit(); + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCreate( + compiler.as_mut_ptr(), + kernel_ptx.len() as size_t, + kernel_ptx.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compiler create result {}", r); + let compiler = unsafe { compiler.assume_init() }; + + let mut major = 0; + let mut minor = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + }; + emit_call_site_warning!("PTX version result {}", r); + emit_call_site_warning!("PTX compiler version {}.{}", major, minor); + + let kernel_name = if specialisation.is_empty() { + format!("{kernel_hash}_kernel") + } else { + format!( + "{kernel_hash}_kernel_{:016x}", + seahash::hash(specialisation.as_bytes()) + ) + }; + + let options = vec![ + CString::new("--entry").unwrap(), + CString::new(kernel_name).unwrap(), + CString::new("--verbose").unwrap(), + CString::new("--warn-on-double-precision-use").unwrap(), + CString::new("--warn-on-local-memory-usage").unwrap(), + CString::new("--warn-on-spills").unwrap(), + ]; + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCompile( + compiler, + options_ptrs.len() as c_int, + options_ptrs.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compile result {}", r); + + let mut info_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + }; + emit_call_site_warning!("PTX info log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = 
Vec::with_capacity(info_log_size as usize); + if info_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX info log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + info_log.set_len(info_log_size as usize); + } + } + let info_log = String::from_utf8_lossy(&info_log); + + let mut error_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + }; + emit_call_site_warning!("PTX error log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + if error_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX error log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + error_log.set_len(error_log_size as usize); + } + } + let error_log = String::from_utf8_lossy(&error_log); + + // Ensure the compiler is not dropped + let mut compiler = MaybeUninit::new(compiler); + let r = unsafe { ptx_compiler::sys::nvPTXCompilerDestroy(compiler.as_mut_ptr()) }; + emit_call_site_warning!("PTX compiler destroy result {}", r); + + if !info_log.is_empty() { + emit_call_site_warning!("PTX compiler info log:\n{}", info_log); + } + if !error_log.is_empty() { + let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; + let mut indent = 0; + while max_lines > 0 { + max_lines /= 10; + indent += 1; + } + + abort_call_site!( + "PTX compiler error log:\n{}\nPTX source:\n{}", + error_log, + kernel_ptx + .lines() + .enumerate() + .map(|(i, l)| format!("{:indent$}| {l}", i + 1)) + .collect::>() + .join("\n") + ); + } + (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index 179ba7eed..d412bd316 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -84,7 +84,7 @@ pub(super) fn quote_get_ptx_str( quote! { fn get_ptx_str() -> &'static str { #crate_path::host::link_kernel!{ - #func_ident #args #crate_name #crate_manifest_dir #generic_start_token + #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token } diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index c5de73430..a2b17627f 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -19,4 +19,11 @@ mod sealed { for crate::utils::device_copy::SafeDeviceCopyWrapper { } + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl SafeDeviceCopy for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl + // SafeDeviceCopy for crate::utils::shared::slice::ThreadBlockSharedSlice + // {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 22488efb8..dbc163e59 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -22,4 +22,10 @@ mod private { { } impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl NoAliasing for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl NoAliasing + // for crate::utils::shared::slice::ThreadBlockSharedSlice {} } From 5538d71c998a7679fe6f533e6edd3d7ffc876408 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 14:49:33 +0000 Subject: [PATCH 022/120] Clean up the PTX 
JIT implementation --- .../cpu_linker_macro/kernel_func_async/mod.rs | 8 +-- .../kernel_func_async/type_wrap.rs | 67 ++++++++++++------- rust-cuda-ptx-jit/src/host/arguments.rs | 48 ------------- .../src/host/compiler/replace.rs | 6 +- rust-cuda-ptx-jit/src/host/mod.rs | 2 - rust-cuda-ptx-jit/src/lib.rs | 8 +++ src/host.rs | 2 +- 7 files changed, 56 insertions(+), 85 deletions(-) delete mode 100644 rust-cuda-ptx-jit/src/host/arguments.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs index 44cc4d904..c01dcdce3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs @@ -35,7 +35,7 @@ pub(super) fn quote_kernel_func_async( macro_type_ids, ); let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(func_inputs); + generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = generate_launch_types( crate_path, @@ -60,11 +60,9 @@ pub(super) fn quote_kernel_func_async( } = #crate_path::host::Launcher::get_launch_package(self); let kernel_jit_result = if config.ptx_jit { - #crate_path::ptx_jit::compilePtxJITwithArguments! { - kernel.compile_with_ptx_jit_args(#(#func_cpu_ptx_jit_wrap),*) - }? + kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? } else { - kernel.compile_with_ptx_jit_args(None)? + kernel.compile_with_ptx_jit_args(None)? 
}; let function = match kernel_jit_result { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs index 50ea505f1..54ba2945b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs @@ -5,34 +5,49 @@ use crate::kernel::wrapper::InputCudaType; use super::super::super::super::FunctionInputs; pub(super) fn generate_func_input_and_ptx_jit_wraps( + crate_path: &syn::Path, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - quote! { ConstLoad[#pat.for_host()] } - } else { - quote! { Ignore[#pat] } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip() +) -> (Vec, TokenStream) { + let mut any_ptx_jit = false; + + let (func_input_wrap, func_cpu_ptx_jit_wrap): (Vec, Vec) = + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .map(|(arg, (cuda_mode, ptx_jit))| match arg { + syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { + #[allow(clippy::if_same_then_else)] + let func_input = if let syn::Type::Reference(_) = &**ty { + quote! { unsafe { #pat.for_device_async() } } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote! { unsafe { #pat.for_device_async() } } + } else { + quote! 
{ #pat } + }; + + let ptx_load = if ptx_jit.0 { + any_ptx_jit = true; + + quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } + } else { + quote! { None } + }; + + (func_input, ptx_load) + }, + syn::FnArg::Receiver(_) => unreachable!(), + }) + .unzip(); + + if any_ptx_jit { + ( + func_input_wrap, + quote!(Some(&[#(#func_cpu_ptx_jit_wrap),*])), + ) + } else { + (func_input_wrap, quote!(None)) + } } diff --git a/rust-cuda-ptx-jit/src/host/arguments.rs b/rust-cuda-ptx-jit/src/host/arguments.rs deleted file mode 100644 index 0a67d42ea..000000000 --- a/rust-cuda-ptx-jit/src/host/arguments.rs +++ /dev/null @@ -1,48 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -macro_rules! compilePtxJITwithArguments { - // Invocation without arguments fast track - ($compiler:ident ()) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ()) - }; - // Invocation without arguments fast track - ($compiler:ident $(. $path:ident)+ ()) => { - $compiler$(.$path)+(None) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!($compiler.with_arguments ( $($args)* )) - }; - // Invocation with arguments is forwarded to incremental muncher - ($compiler:ident $(. $path:ident)+ ( $($args:tt)* )) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [, $($args)*] =>) - }; - // Muncher base case: no `ConstLoad[$expr]` arguments - (@munch None $compiler:ident $(. $path:ident)+ => [] => $($rubbish:expr),*) => { - $compiler$(.$path)+(None) - }; - // Muncher base case: at least one `ConstLoad[$expr]` argument - (@munch Some $compiler:ident $(. $path:ident)+ => [] => $($exprs:expr),*) => { - $compiler$(.$path)+(Some(&[$($exprs),*])) - }; - // Muncher helper case: first `ConstLoad[$expr]` argument is recognised (redirect) - (@munch None $compiler:ident $(. 
$path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [, ConstLoad [ $head ] $($tail)*] => $($exprs),*) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (no `ConstLoad[$expr]`s so far) - (@munch None $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch None $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `Ignore[$expr]` argument (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, Ignore [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* None) - }; - // Muncher recursive case: much one `ConstLoad[$expr]` (some `ConstLoad[$expr]`s already) - (@munch Some $compiler:ident $(. $path:ident)+ => [, ConstLoad [ $head:expr ] $($tail:tt)*] => $($exprs:expr),*) => { - $crate::compilePtxJITwithArguments!(@munch Some $compiler$(.$path)+ => [$($tail)*] => $($exprs,)* Some(unsafe { - ::std::slice::from_raw_parts($head as *const _ as *const u8, ::std::mem::size_of_val($head)) - })) - }; -} diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/rust-cuda-ptx-jit/src/host/compiler/replace.rs index df4d270b8..920842d6f 100644 --- a/rust-cuda-ptx-jit/src/host/compiler/replace.rs +++ b/rust-cuda-ptx-jit/src/host/compiler/replace.rs @@ -4,7 +4,7 @@ use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; impl PtxJITCompiler { #[allow(clippy::too_many_lines)] - pub fn with_arguments(&mut self, arguments: Option<&[Option<&[u8]>]>) -> PtxJITResult { + pub fn with_arguments(&mut self, arguments: Option<&[Option<*const [u8]>]>) -> PtxJITResult { // Check if the arguments, cast as byte slices, are the same as the last cached // ones 
#[allow(clippy::explicit_deref_methods)] @@ -16,7 +16,7 @@ impl PtxJITCompiler { .zip(last_arguments.iter()) .all(|(a, b)| match (a, b) { (None, None) => false, - (Some(a), Some(b)) => *a != b.deref(), + (Some(a), Some(b)) => (unsafe { &**a }) != b.deref(), _ => true, }) }, @@ -30,7 +30,7 @@ impl PtxJITCompiler { self.last_arguments = arguments.map(|arguments| { arguments .iter() - .map(|arg| arg.map(|bytes| bytes.to_owned().into_boxed_slice())) + .map(|arg| arg.map(|bytes| unsafe { &*bytes }.to_owned().into_boxed_slice())) .collect::>>>() .into_boxed_slice() }); diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs index d0d9ffb53..2ace3405d 100644 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ b/rust-cuda-ptx-jit/src/host/mod.rs @@ -1,4 +1,2 @@ pub mod compiler; pub mod kernel; - -mod arguments; diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index ae6080a3e..1f22b2830 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -1,5 +1,6 @@ #![deny(clippy::pedantic)] #![cfg_attr(not(feature = "host"), no_std)] +#![feature(ptr_from_ref)] #![feature(doc_cfg)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -12,3 +13,10 @@ pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKer #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] mod device; + +pub fn arg_as_raw_bytes(r: &T) -> *const [u8] { + core::ptr::slice_from_raw_parts( + core::ptr::from_ref(r).cast::(), + core::mem::size_of_val(r), + ) +} diff --git a/src/host.rs b/src/host.rs index a104c50a3..591ed4ed5 100644 --- a/src/host.rs +++ b/src/host.rs @@ -95,7 +95,7 @@ impl TypedKernel { /// (from [`Self::new`]). 
pub fn compile_with_ptx_jit_args( &mut self, - arguments: Option<&[Option<&[u8]>]>, + arguments: Option<&[Option<*const [u8]>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); From eb576605b3415f0eaa9a7b5a028a57e754f7d5d2 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 15:11:54 +0000 Subject: [PATCH 023/120] Add convenience functions for ThreadBlockShared arrays --- examples/single-source/src/main.rs | 15 +++++++-------- src/utils/shared/static.rs | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index c3b83d5ec..af382ff42 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,7 +10,6 @@ extern crate alloc; -#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -47,7 +46,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - // #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { @@ -56,15 +55,15 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.as_mut_ptr().cast::().add(1)).0 = (f64::from(s) * 2.0) as u32; + (*shared.index_mut(1)).0 = (f64::from(s) * 2.0) as u32; + } + unsafe { + (*shared2.index_mut(2)).1 = 24; } + // unsafe { core::arch::asm!("hi") } unsafe { - (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; + *shared3.as_mut_ptr() = 12; } - unsafe { core::arch::asm!("hi") } - // unsafe { - // *shared3.as_mut_ptr() = 12; - // } } #[cfg(not(target_os = "cuda"))] diff --git a/src/utils/shared/static.rs 
b/src/utils/shared/static.rs index b93e24523..58973ace4 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -7,12 +7,14 @@ use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; #[cfg(not(target_os = "cuda"))] +#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { marker: PhantomData, } #[cfg(target_os = "cuda")] +#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { shared: *mut T, @@ -64,6 +66,27 @@ impl ThreadBlockShared { } } +impl ThreadBlockShared<[T; N]> { + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[inline] + #[must_use] + pub fn index(&self, index: usize) -> *const T { + self.index_mut(index) + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[inline] + #[must_use] + pub fn index_mut(&self, index: usize) -> *mut T { + assert!(index < N); + + // Safety: Since *[T; N] is valid, *T is valid iff index < N + unsafe { self.shared.cast::().add(index) } + } +} + unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] From a5ffb0e8da5efda143d1731127a2f11a62cb002e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 16:23:38 +0000 Subject: [PATCH 024/120] Improve and fix CI --- .github/workflows/ci.yml | 109 ++++++------------------- examples/single-source/src/main.rs | 4 +- src/lib.rs | 1 + src/safety/device_copy.rs | 8 +- src/safety/no_aliasing.rs | 5 +- src/safety/stack_only.rs | 2 +- src/utils/shared/slice.rs | 127 ++++++++--------------------- src/utils/shared/static.rs | 25 +++--- 8 files changed, 74 insertions(+), 207 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57b6377f7..2e66a8ed9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,53 +40,25 @@ jobs: sudo ./llvm.sh $(rustc --version -v | grep -oP "LLVM version: \K\d+") rm llvm.sh cargo install rust-ptx-linker --git 
https://github.com/juntyr/rust-ptx-linker --force + + - name: Install cargo-hack + uses: taiki-e/install-action@cargo-hack - - name: Check without features on CPU - run: | - cargo check - - - name: Check with alloc feature on CPU - run: | - cargo check \ - --features alloc - - - name: Check with derive feature on CPU - run: | - cargo check \ - --features derive - - - name: Check with host feature on CPU - run: | - cargo check \ - --features host - - - name: Check with host,derive,alloc features on CPU + - name: Check feature powerset on the CPU run: | - cargo check \ - --features host,derive,alloc + cargo hack check --feature-powerset --optional-deps \ + --keep-going - - name: Check without features on CUDA + - name: Check feature powerset on CUDA run: | - cargo check \ + cargo hack check --feature-powerset --optional-deps \ + --skip host,rustacuda,rustacuda_derive \ + --keep-going \ --target nvptx64-nvidia-cuda - - name: Check with alloc feature on CUDA - run: | - cargo check \ - --target nvptx64-nvidia-cuda \ - --features alloc - - - name: Check with derive feature on CUDA - run: | - cargo check \ - --target nvptx64-nvidia-cuda \ - --features derive - - name: Check all workspace targets run: | - cargo check \ - --workspace \ - --all-targets + cargo check --workspace --all-targets test: name: Test Suite @@ -176,58 +148,23 @@ jobs: rm llvm.sh cargo install rust-ptx-linker --git https://github.com/juntyr/rust-ptx-linker --force - - name: Check the code style without features on CPU - run: | - cargo clippy \ - -- -D warnings - - - name: Check the code style with alloc feature on CPU - run: | - cargo clippy \ - --features alloc \ - -- -D warnings - - - name: Check the code style with derive feature on CPU - run: | - cargo clippy \ - --features derive \ - -- -D warnings - - - name: Check the code style with host feature on CPU - run: | - cargo clippy \ - --features host \ - -- -D warnings - - - name: Check the code style with host,derive,alloc 
features on CPU - run: | - cargo clippy \ - --features host,derive,alloc \ - -- -D warnings - - - name: Check the code style without features on CUDA - run: | - cargo clippy \ - --target nvptx64-nvidia-cuda \ - -- -D warnings + - name: Install cargo-hack + uses: taiki-e/install-action@cargo-hack - - name: Check the code style with alloc feature on CUDA + - name: Check feature powerset on the CPU run: | - cargo clippy \ - --target nvptx64-nvidia-cuda \ - --features alloc \ + cargo hack clippy --feature-powerset --optional-deps \ + --keep-going \ -- -D warnings - - - name: Check the code style with derive feature on CUDA + + - name: Check feature powerset on CUDA run: | - cargo clippy \ + cargo hack clippy --feature-powerset --optional-deps \ + --skip host,rustacuda,rustacuda_derive \ + --keep-going \ --target nvptx64-nvidia-cuda \ - --features derive \ -- -D warnings - - name: Check the code style for all workspace targets + - name: Check all workspace targets run: | - cargo clippy \ - --workspace \ - --all-targets \ - -- -D warnings + cargo clippy --workspace --all-targets -- -D warnings diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index af382ff42..e1030d261 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -55,10 +55,10 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.index_mut(1)).0 = (f64::from(s) * 2.0) as u32; + (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { - (*shared2.index_mut(2)).1 = 24; + (*shared2.index_mut_unchecked(2)).1 = 24; } // unsafe { core::arch::asm!("hi") } unsafe { diff --git a/src/lib.rs b/src/lib.rs index 0c149d40f..de590c29b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(impl_trait_in_assoc_type)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] +#![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] 
#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] #[doc(hidden)] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index a2b17627f..0631f54d1 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -20,10 +20,6 @@ mod sealed { { } - // Only unsafe aliasing is possible since both only expose raw pointers - // impl SafeDeviceCopy for - // crate::utils::shared::r#static::ThreadBlockShared {} - // impl - // SafeDeviceCopy for crate::utils::shared::slice::ThreadBlockSharedSlice - // {} + // No data is actually copied to the device + impl SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index dbc163e59..0b246a52f 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -24,8 +24,5 @@ mod private { impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} // Only unsafe aliasing is possible since both only expose raw pointers - // impl NoAliasing for - // crate::utils::shared::r#static::ThreadBlockShared {} - // impl NoAliasing - // for crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index eb3a69706..47812cbcc 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -37,7 +37,7 @@ mod sealed { impl !StackOnly for &mut T {} impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !StackOnly + impl !StackOnly for crate::utils::shared::slice::ThreadBlockSharedSlice { } diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 238b1aac8..e1f95ba95 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -2,14 +2,11 @@ use core::marker::PhantomData; use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -use crate::common::{CudaAsRust, DeviceAccessible, 
RustToCuda}; #[cfg(not(target_os = "cuda"))] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSlice { len: usize, marker: PhantomData, } @@ -17,29 +14,11 @@ pub struct ThreadBlockSharedSlice { #[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSlice { shared: *mut [T], } -#[doc(hidden)] -#[derive(TypeLayout)] -#[layout(bound = "T: 'static + ~const TypeGraphLayout")] -#[repr(C)] -pub struct ThreadBlockSharedSliceCudaRepresentation { - len: usize, - // Note: uses a zero-element array instead of PhantomData here so that - // TypeLayout can still observe T's layout - marker: [T; 0], -} - -unsafe impl DeviceCopy - for ThreadBlockSharedSliceCudaRepresentation -{ -} - -// #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -// #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockSharedSlice { +impl ThreadBlockSharedSlice { #[cfg(any(not(target_os = "cuda"), doc))] #[doc(cfg(not(target_os = "cuda")))] #[must_use] @@ -50,6 +29,22 @@ impl ThreadBlockSharedSlice { } } + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] + #[must_use] + pub fn with_len(mut self, len: usize) -> Self { + self.len = len; + self + } + + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] + #[must_use] + pub fn with_len_mut(&mut self, len: usize) -> &mut Self { + self.len = len; + self + } + #[cfg(not(target_os = "cuda"))] #[must_use] pub fn len(&self) -> usize { @@ -70,82 +65,28 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_slice_ptr(&self) -> *mut [T] { - self.shared + pub fn as_mut_ptr(&self) -> *mut T { + self.shared.cast() } #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn 
as_mut_ptr(&self) -> *mut T { - self.shared.cast() + pub fn as_mut_slice_ptr(&self) -> *mut [T] { + self.shared } -} -unsafe impl RustToCuda for ThreadBlockSharedSlice { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = ThreadBlockSharedSliceCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn borrow( + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + /// Safety: + /// + /// The provided `index` must not be out of bounds. + #[inline] + #[must_use] + pub unsafe fn index_mut_unchecked>( &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - Ok(( - DeviceAccessible::from(ThreadBlockSharedSliceCudaRepresentation { - len: self.len, - marker: [], - }), - crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); - - Ok(alloc) - } -} - -unsafe impl CudaAsRust - for ThreadBlockSharedSliceCudaRepresentation -{ - type RustRepresentation = ThreadBlockSharedSlice; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { - todo!() - - // unsafe { - // core::arch::asm!( - // ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", - // align = const(core::mem::align_of::()), - // ); - // } - - // let base: *mut u8; - - // unsafe { - // core::arch::asm!( - // "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", - // reg = out(reg64) base, - // ); - // } - - // let slice = core::ptr::slice_from_raw_parts_mut( - // base.add(self.byte_offset).cast(), self.len, - // ); + index: I, + ) -> *mut >::Output { + 
self.shared.get_unchecked_mut(index) } } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 58973ace4..368dc8296 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -69,25 +69,20 @@ impl ThreadBlockShared { impl ThreadBlockShared<[T; N]> { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] + /// Safety: + /// + /// The provided `index` must not be out of bounds. #[inline] #[must_use] - pub fn index(&self, index: usize) -> *const T { - self.index_mut(index) - } - - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] - #[inline] - #[must_use] - pub fn index_mut(&self, index: usize) -> *mut T { - assert!(index < N); - - // Safety: Since *[T; N] is valid, *T is valid iff index < N - unsafe { self.shared.cast::().add(index) } + pub unsafe fn index_mut_unchecked>( + &self, + index: I, + ) -> *mut >::Output { + core::ptr::slice_from_raw_parts_mut(self.shared.cast::(), N).get_unchecked_mut(index) } } -unsafe impl RustToCuda for ThreadBlockShared { +unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] type CudaAllocation = crate::host::NullCudaAlloc; @@ -120,7 +115,7 @@ unsafe impl RustToCuda for ThreadBlockShare } } -unsafe impl CudaAsRust +unsafe impl CudaAsRust for ThreadBlockSharedCudaRepresentation { type RustRepresentation = ThreadBlockShared; From 8864dbf91ba879d4cf53670e95cc7f75ee23bbe5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 May 2023 17:30:51 +0000 Subject: [PATCH 025/120] Remove broken ThreadBlockShared RustToCuda impl --- examples/single-source/src/main.rs | 9 +++-- src/safety/device_copy.rs | 11 ++++-- src/safety/no_aliasing.rs | 7 +++- src/safety/stack_only.rs | 1 + src/safety/unified_heap.rs | 7 ++++ src/utils/shared/static.rs | 63 ------------------------------ 6 files changed, 27 insertions(+), 71 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 
e1030d261..55a2e8046 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ extern crate alloc; +#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -46,7 +47,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { @@ -61,9 +62,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( (*shared2.index_mut_unchecked(2)).1 = 24; } // unsafe { core::arch::asm!("hi") } - unsafe { - *shared3.as_mut_ptr() = 12; - } + // unsafe { + // *shared3.as_mut_ptr() = 12; + // } } #[cfg(not(target_os = "cuda"))] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index 0631f54d1..ee1cef0dc 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -7,6 +7,14 @@ mod sealed { #[marker] pub trait SafeDeviceCopy {} + // Thread-block-shared data cannot be copied since information is added inside + // CUDA + impl !SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} + impl !SafeDeviceCopy + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } + impl SafeDeviceCopy for T {} #[cfg(any(feature = "alloc", doc))] impl SafeDeviceCopy for T {} @@ -19,7 +27,4 @@ mod sealed { for crate::utils::device_copy::SafeDeviceCopyWrapper { } - - // No data is actually copied to the device - impl SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 0b246a52f..98a180b6a 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -23,6 +23,11 @@ mod private { } impl NoAliasing for 
crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} - // Only unsafe aliasing is possible since both only expose raw pointers + // Thread-block-shared data only allows unsafe aliasing since only raw pointers + // are exposed impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} + impl NoAliasing + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 47812cbcc..5dc5c0cbb 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -36,6 +36,7 @@ mod sealed { impl !StackOnly for &T {} impl !StackOnly for &mut T {} + // Thread-block-shared data contains data not on the stack impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} impl !StackOnly for crate::utils::shared::slice::ThreadBlockSharedSlice diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs index 9eda2d550..483b40c3a 100644 --- a/src/safety/unified_heap.rs +++ b/src/safety/unified_heap.rs @@ -38,6 +38,13 @@ mod sealed { impl !UnifiedHeapOnly for &T {} impl !UnifiedHeapOnly for &mut T {} + // Thread-block-shared data contains CUDA-only data + impl !UnifiedHeapOnly for crate::utils::shared::r#static::ThreadBlockShared {} + impl !UnifiedHeapOnly + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } + impl UnifiedHeapOnly for core::marker::PhantomData {} impl UnifiedHeapOnly for alloc::boxed::Box {} diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 368dc8296..324c0fdef 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,36 +1,18 @@ #[cfg(not(target_os = "cuda"))] use core::marker::PhantomData; -use const_type_layout::TypeGraphLayout; -use rustacuda_core::DeviceCopy; - -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; - #[cfg(not(target_os = "cuda"))] -#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { marker: PhantomData, } #[cfg(target_os = "cuda")] 
-#[derive(TypeLayout)] #[repr(transparent)] pub struct ThreadBlockShared { shared: *mut T, } -#[doc(hidden)] -#[derive(TypeLayout)] -#[repr(transparent)] -pub struct ThreadBlockSharedCudaRepresentation { - // Note: uses a zero-element array instead of PhantomData here so that - // TypeLayout can still observe T's layout - marker: [T; 0], -} - -unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} - impl ThreadBlockShared { #[cfg(not(target_os = "cuda"))] #[must_use] @@ -81,48 +63,3 @@ impl ThreadBlockShared<[T; N]> { core::ptr::slice_from_raw_parts_mut(self.shared.cast::(), N).get_unchecked_mut(index) } } - -unsafe impl RustToCuda for ThreadBlockShared { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = crate::host::NullCudaAlloc; - type CudaRepresentation = ThreadBlockSharedCudaRepresentation; - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::host::CombinedCudaAlloc, - )> { - Ok(( - DeviceAccessible::from(ThreadBlockSharedCudaRepresentation { marker: [] }), - crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), - )) - } - - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - unsafe fn restore( - &mut self, - alloc: crate::host::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); - - Ok(alloc) - } -} - -unsafe impl CudaAsRust - for ThreadBlockSharedCudaRepresentation -{ - type RustRepresentation = ThreadBlockShared; - - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] - unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { - ThreadBlockShared::new_uninit() - } -} From 9645e3c43adaf0966cba12e0c4d933983d26365c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 09:47:16 +0000 Subject: [PATCH 026/120] Refactor kernel trait generation to push 
more safety constraints to the kernel definition --- examples/single-source/expanded.rs | 1150 +++++++++++++++++ examples/single-source/src/main.rs | 8 +- rust-cuda-derive/src/kernel/link/mod.rs | 2 +- rust-cuda-derive/src/kernel/wrapper/config.rs | 5 + .../generate/cpu_linker_macro/get_ptx_str.rs | 88 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 33 +- .../kernel/wrapper/generate/cpu_wrapper.rs | 193 --- .../kernel_func.rs | 49 +- .../kernel_func_async/async_func_types.rs | 13 +- .../kernel_func_async/launch_types.rs | 42 +- .../kernel_func_async/mod.rs | 70 +- .../kernel_func_async/type_wrap.rs | 0 .../wrapper/generate/cpu_wrapper/mod.rs | 96 ++ rust-cuda-derive/src/kernel/wrapper/mod.rs | 49 +- .../src/rust_to_cuda/field_copy.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 25 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/common.rs | 60 +- src/device/mod.rs | 4 + src/host.rs | 46 +- src/utils/aliasing/const.rs | 18 +- src/utils/aliasing/dynamic.rs | 18 +- src/utils/aliasing/final.rs | 18 +- src/utils/box.rs | 12 +- src/utils/boxed_slice.rs | 12 +- src/utils/device_copy.rs | 32 +- src/utils/exchange/buffer/device.rs | 3 +- src/utils/exchange/buffer/host.rs | 6 +- src/utils/exchange/wrapper.rs | 10 +- src/utils/option.rs | 7 +- src/utils/shared/slice.rs | 2 +- src/utils/shared/static.rs | 2 +- 32 files changed, 1611 insertions(+), 468 deletions(-) create mode 100644 examples/single-source/expanded.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func.rs (84%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/async_func_types.rs (89%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/launch_types.rs (63%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/mod.rs 
(70%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => cpu_wrapper}/kernel_func_async/type_wrap.rs (100%) create mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs diff --git a/examples/single-source/expanded.rs b/examples/single-source/expanded.rs new file mode 100644 index 000000000..f16379c37 --- /dev/null +++ b/examples/single-source/expanded.rs @@ -0,0 +1,1150 @@ +#![feature(prelude_import)] +#![deny(clippy::pedantic)] +#![feature(cfg_version)] +#![feature(const_type_name)] +#![feature(const_refs_to_cell)] +#![feature(const_trait_impl)] +#![feature(const_mut_refs)] +#[prelude_import] +use std::prelude::rust_2021::*; +#[macro_use] +extern crate std; +extern crate alloc; +#[cfg(not(target_os = "cuda"))] +fn main() {} +#[repr(C)] +#[layout(crate = "rc::const_type_layout")] +pub struct Dummy(i32); +unsafe impl const rc::const_type_layout::TypeLayout for Dummy { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Dummy { 0: _, .. 
}: Dummy; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Dummy = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) + = (::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(Dummy(f_0.assume_init())), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for Dummy { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::populate_graph(graph); + } + } +} +#[cuda(crate = "rc")] +#[allow(dead_code)] +pub struct Wrapper { + #[cuda(embed)] + inner: T, +} +#[allow(dead_code)] +#[doc(hidden)] +#[allow(dead_code)] +#[repr(C)] +#[layout(free = "T")] +#[layout(crate = "rc :: const_type_layout")] +pub struct WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + inner: rc::common::DeviceAccessible< + ::CudaRepresentation, + >, +} +unsafe impl const rc::const_type_layout::TypeLayout for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + 
structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "inner", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let WrapperCudaRepresentation { + inner: _, + .. + }: WrapperCudaRepresentation; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + as ::const_type_layout::TypeLayout>::uninit() + } { + let base_ptr: *const WrapperCudaRepresentation = (&raw const uninit) + .cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).inner }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::< + rc::common::DeviceAccessible< + ::CudaRepresentation, + >, + >(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(inner)) + = (::CudaRepresentation, + > as rc::const_type_layout::TypeLayout>::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(WrapperCudaRepresentation { + inner: inner.assume_init(), + }), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::CudaRepresentation, + > as rc::const_type_layout::TypeGraph>::populate_graph(graph); + } + } +} +unsafe impl rc::rustacuda_core::DeviceCopy for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{} +unsafe impl 
rc::common::RustToCuda for Wrapper +where + T: rc::common::RustToCuda, +{ + type CudaRepresentation = WrapperCudaRepresentation; + type CudaAllocation = rc::common::CombinedCudaAlloc< + ::CudaAllocation, + rc::common::NullCudaAlloc, + >; + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow( + &self, + alloc: CudaAllocType, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let (field_inner_repr, alloc_front) = rc::common::RustToCuda::borrow( + &self.inner, + alloc_front, + )?; + let borrow = WrapperCudaRepresentation { + inner: field_inner_repr, + }; + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + let alloc_front = rc::common::RustToCuda::restore(&mut self.inner, alloc_front)?; + Ok(alloc_tail) + } +} +unsafe impl rc::common::RustToCudaAsync for Wrapper +where + T: rc::common::RustToCudaAsync, +{ + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let (field_inner_repr, alloc_front) = rc::common::RustToCudaAsync::borrow_async( + &self.inner, + alloc_front, + stream, + )?; + let borrow = WrapperCudaRepresentation { + inner: field_inner_repr, + }; + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: 
rc::common::CombinedCudaAlloc, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + let alloc_front = rc::common::RustToCudaAsync::restore_async( + &mut self.inner, + alloc_front, + stream, + )?; + Ok(alloc_tail) + } +} +unsafe impl rc::common::CudaAsRust for WrapperCudaRepresentation +where + T: rc::common::RustToCuda, +{ + type RustRepresentation = Wrapper; +} +#[cuda(crate = "rc")] +pub struct Empty([u8; 0]); +#[allow(dead_code)] +#[doc(hidden)] +#[repr(C)] +#[layout(crate = "rc :: const_type_layout")] +pub struct EmptyCudaRepresentation( + rc::common::DeviceAccessible>, +); +unsafe impl const rc::const_type_layout::TypeLayout for EmptyCudaRepresentation { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let EmptyCudaRepresentation { + 0: _, + .. 
+ }: EmptyCudaRepresentation; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const EmptyCudaRepresentation = (&raw const uninit) + .cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::< + rc::common::DeviceAccessible< + rc::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>, + >, + >(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) + = (, + > as rc::const_type_layout::TypeLayout>::uninit()) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new(EmptyCudaRepresentation(f_0.assume_init())), + ) + } else { + rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for EmptyCudaRepresentation { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + , + > as rc::const_type_layout::TypeGraph>::populate_graph(graph); + } + } +} +unsafe impl rc::rustacuda_core::DeviceCopy for EmptyCudaRepresentation {} +unsafe impl rc::common::RustToCuda for Empty { + type CudaRepresentation = EmptyCudaRepresentation; + type CudaAllocation = rc::common::NullCudaAlloc; + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow( + &self, + alloc: CudaAllocType, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = 
rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let field_0_repr = rc::common::DeviceAccessible::from(&self.0); + let borrow = EmptyCudaRepresentation(field_0_repr); + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} +unsafe impl rc::common::RustToCudaAsync for Empty { + #[cfg(not(target_os = "cuda"))] + unsafe fn borrow_async( + &self, + alloc: CudaAllocType, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult< + ( + rc::common::DeviceAccessible, + rc::common::CombinedCudaAlloc, + ), + > { + let alloc_front = rc::common::NullCudaAlloc; + let alloc_tail = alloc; + let field_0_repr = rc::common::DeviceAccessible::from(&self.0); + let borrow = EmptyCudaRepresentation(field_0_repr); + Ok(( + rc::common::DeviceAccessible::from(borrow), + rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), + )) + } + #[cfg(not(target_os = "cuda"))] + unsafe fn restore_async( + &mut self, + alloc: rc::common::CombinedCudaAlloc, + stream: &rc::rustacuda::stream::Stream, + ) -> rc::rustacuda::error::CudaResult { + let (alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} +unsafe impl rc::common::CudaAsRust for EmptyCudaRepresentation { + type RustRepresentation = Empty; +} +#[repr(C)] +#[layout(crate = "rc::const_type_layout")] +pub struct Tuple(u32, i32); +unsafe impl const rc::const_type_layout::TypeLayout for Tuple { + const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { + rc::const_type_layout::TypeLayoutInfo { + name: ::core::any::type_name::(), + size: ::core::mem::size_of::(), + alignment: ::core::mem::align_of::(), + structure: rc::const_type_layout::TypeStructure::Struct { + repr: "C", + fields: &[ + 
rc::const_type_layout::Field { + name: "0", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Tuple { 0: _, .. }: Tuple; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Tuple = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).0 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + rc::const_type_layout::Field { + name: "1", + offset: { + { + #[allow(clippy::unneeded_field_pattern)] + let Tuple { 1: _, .. }: Tuple; + if let ::const_type_layout::MaybeUninhabited::Inhabited( + uninit, + ) + = unsafe { + ::uninit() + } { + let base_ptr: *const Tuple = (&raw const uninit).cast(); + #[allow(unused_unsafe)] + let field_ptr = unsafe { &raw const (*base_ptr).1 }; + #[allow(clippy::cast_sign_loss)] + let offset = unsafe { + field_ptr.cast::().offset_from(base_ptr.cast()) as usize + }; + #[allow(clippy::forget_non_drop, clippy::forget_copy)] + core::mem::forget(uninit); + ::const_type_layout::MaybeUninhabited::Inhabited(offset) + } else { + ::const_type_layout::MaybeUninhabited::Uninhabited + } + } + }, + ty: ::core::any::type_name::(), + }, + ], + }, + } + }; + unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< + ::core::mem::MaybeUninit, + > { + if let ( + rc::const_type_layout::MaybeUninhabited::Inhabited(f_0), + rc::const_type_layout::MaybeUninhabited::Inhabited(f_1), + ) + = ( + ::uninit(), + ::uninit(), + ) { + rc::const_type_layout::MaybeUninhabited::Inhabited( + ::core::mem::MaybeUninit::new( + Tuple(f_0.assume_init(), f_1.assume_init()), + ), + ) + } else { + 
rc::const_type_layout::MaybeUninhabited::Uninhabited + } + } +} +unsafe impl const rc::const_type_layout::TypeGraph for Tuple { + fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { + if graph.insert(&::TYPE_LAYOUT) { + ::populate_graph(graph); + ::populate_graph(graph); + } + } +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait KernelArgs +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + type __T_0; + type __T_1; + type __T_2; + type __T_3; + type __T_4; + type __T_5; +} +unsafe impl KernelArgs for () +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + type __T_0 = Dummy; + type __T_1 = Wrapper; + type __T_2 = Wrapper; + type __T_3 = core::sync::atomic::AtomicU64; + type __T_4 = Wrapper; + type __T_5 = Tuple; +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait KernelPtx +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + fn get_ptx_str() -> &'static str + where + Self: Sized + rc::host::Launcher>; + fn new_kernel() -> rc::rustacuda::error::CudaResult< + rc::host::TypedKernel>, + > + where + Self: Sized + rc::host::Launcher>; +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe trait Kernel: KernelPtx +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{ + #[allow(clippy::needless_lifetimes)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + fn kernel<'stream, '__r2c_lt_0, '__r2c_lt_1, '__r2c_lt_2, '__r2c_move_lt_4, 'a>( + &mut self, + stream: &'stream rc::rustacuda::stream::Stream, + _x: &'__r2c_lt_0 <() as KernelArgs>::__T_0, + _y: &'__r2c_lt_1 mut 
<() as KernelArgs>::__T_1, + _z: &'__r2c_lt_2 <() as KernelArgs>::__T_2, + _v: &'a <() as KernelArgs>::__T_3, + kernel_arg_4: <() as KernelArgs>::__T_4, + s_t: <() as KernelArgs>::__T_5, + ) -> rc::rustacuda::error::CudaResult<()> + where + Self: Sized + rc::host::Launcher>, + { + const fn __check_is_sync(_x: &T) -> bool { + trait IsSyncMarker { + const SYNC: bool = false; + } + impl IsSyncMarker for T {} + struct CheckIs(::core::marker::PhantomData); + #[allow(dead_code)] + impl CheckIs { + const SYNC: bool = true; + } + >::SYNC + } + let mut ___x_box = rc::host::HostDeviceBox::from( + rc::rustacuda::memory::DeviceBox::new( + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), + )?, + ); + #[allow(clippy::redundant_closure_call)] + let __result = (|_x| { + rc::host::LendToCuda::lend_to_cuda_mut( + _y, + |mut _y| { + (|_y| { + rc::host::LendToCuda::lend_to_cuda( + _z, + |_z| { + (|_z| { + let mut ___v_box = rc::host::HostDeviceBox::from( + rc::rustacuda::memory::DeviceBox::new( + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), + )?, + ); + #[allow(clippy::redundant_closure_call)] + let __result = (|_v| { + rc::host::LendToCuda::move_to_cuda( + kernel_arg_4, + |mut kernel_arg_4| { + (|kernel_arg_4| { + { + let s_t = rc::utils::device_copy::SafeDeviceCopyWrapper::from( + s_t, + ); + self.kernel_async( + stream, + _x, + _y, + _z, + _v, + kernel_arg_4, + s_t, + )?; + stream.synchronize() + } + })(kernel_arg_4.as_async()) + }, + ) + })(unsafe { + rc::host::HostAndDeviceConstRef::new( + &___v_box, + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), + ) + .as_async() + }); + if !__check_is_sync(_v) { + ___v_box + .copy_to(unsafe { &mut *(_v as *const _ as *mut _) })?; + } + ::core::mem::drop(___v_box); + __result + })(_z.as_async()) + }, + ) + })(_y.as_async()) + }, + ) + })(unsafe { + rc::host::HostAndDeviceConstRef::new( + &___x_box, + rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), + ) + .as_async() + }); + if 
!__check_is_sync(_x) { + ___x_box.copy_to(unsafe { &mut *(_x as *const _ as *mut _) })?; + } + ::core::mem::drop(___x_box); + __result + } + #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + fn kernel_async< + 'stream, + '__r2c_lt_0, + '__r2c_lt_1, + '__r2c_lt_2, + '__r2c_move_lt_4, + 'a, + >( + &mut self, + stream: &'stream rc::rustacuda::stream::Stream, + _x: rc::host::HostAndDeviceConstRefAsync< + 'stream, + '__r2c_lt_0, + rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_0>, + >, + mut _y: rc::host::HostAndDeviceMutRefAsync< + 'stream, + '__r2c_lt_1, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _z: rc::host::HostAndDeviceConstRefAsync< + 'stream, + '__r2c_lt_2, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _v: rc::host::HostAndDeviceConstRefAsync< + 'stream, + 'a, + rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_3>, + >, + kernel_arg_4: rc::host::HostAndDeviceOwnedAsync< + 'stream, + '__r2c_move_lt_4, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + s_t: rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_5>, + ) -> rc::rustacuda::error::CudaResult<()> + where + Self: Sized + rc::host::Launcher>, + { + let rc::host::LaunchPackage { kernel, watcher, config } = rc::host::Launcher::get_launch_package( + self, + ); + let kernel_jit_result = if config.ptx_jit { + kernel + .compile_with_ptx_jit_args( + Some( + &[ + None, + Some(rc::ptx_jit::arg_as_raw_bytes(_y.for_host())), + None, + Some(rc::ptx_jit::arg_as_raw_bytes(_v.for_host())), + None, + None, + ], + ), + )? + } else { + kernel.compile_with_ptx_jit_args(None)? 
+ }; + let function = match kernel_jit_result { + rc::host::KernelJITResult::Recompiled(function) => { + ::on_compile(function, watcher)?; + function + } + rc::host::KernelJITResult::Cached(function) => function, + }; + #[allow(clippy::redundant_closure_call)] + (| + _x: rc::common::DeviceConstRef< + '__r2c_lt_0, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_0, + >, + >, + _y: rc::common::DeviceMutRef< + '__r2c_lt_1, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _z: rc::common::DeviceConstRef< + '__r2c_lt_2, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + _v: rc::common::DeviceConstRef< + 'a, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_3, + >, + >, + kernel_arg_4: rc::common::DeviceMutRef< + '__r2c_move_lt_4, + rc::common::DeviceAccessible< + <<() as KernelArgs< + T, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + s_t: rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_5, + >| + { + if false { + #[allow(dead_code)] + fn assert_impl_devicecopy(_val: &T) {} + #[allow(dead_code)] + fn assert_impl_no_aliasing() {} + #[allow(dead_code)] + fn assert_impl_fits_into_device_register< + T: rc::safety::FitsIntoDeviceRegister, + >(_val: &T) {} + assert_impl_devicecopy(&_x); + assert_impl_devicecopy(&_y); + assert_impl_devicecopy(&_z); + assert_impl_devicecopy(&_v); + assert_impl_devicecopy(&kernel_arg_4); + assert_impl_devicecopy(&s_t); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_0>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_1>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_2>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_3>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_4>(); + assert_impl_no_aliasing::<<() as KernelArgs>::__T_5>(); + 
assert_impl_fits_into_device_register(&_x); + assert_impl_fits_into_device_register(&_y); + assert_impl_fits_into_device_register(&_z); + assert_impl_fits_into_device_register(&_v); + assert_impl_fits_into_device_register(&kernel_arg_4); + assert_impl_fits_into_device_register(&s_t); + } + let rc::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _ } = config; + unsafe { + stream + .launch( + function, + grid, + block, + shared_memory_size, + &[ + &_x as *const _ as *mut ::std::ffi::c_void, + &_y as *const _ as *mut ::std::ffi::c_void, + &_z as *const _ as *mut ::std::ffi::c_void, + &_v as *const _ as *mut ::std::ffi::c_void, + &kernel_arg_4 as *const _ as *mut ::std::ffi::c_void, + &s_t as *const _ as *mut ::std::ffi::c_void, + ], + ) + } + })( + unsafe { _x.for_device_async() }, + unsafe { _y.for_device_async() }, + unsafe { _z.for_device_async() }, + unsafe { _v.for_device_async() }, + unsafe { kernel_arg_4.for_device_async() }, + s_t, + ) + } +} +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::missing_safety_doc)] +unsafe impl> Kernel for K +where + T: rc::safety::StackOnly, + ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, +{} +#[cfg(not(target_os = "cuda"))] +const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, +> = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_type_layout\n\n.visible .entry kernel_type_layout()\n{\n\n\n\tret;\n\n}\n\t// .globl\tkernel_dfae7eaf723a670c\n.visible .entry kernel_dfae7eaf723a670c()\n{\n\n\n\tret;\n\n}\n" + .as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c".as_bytes(), + ) + }, +>; +#[cfg(not(target_os = "cuda"))] +mod host { + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; + #[allow(dead_code)] + struct 
Launcher(core::marker::PhantomData); + unsafe impl KernelPtx for Launcher { + fn get_ptx_str() -> &'static str { + const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_aab1c403129e575b\n.visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_4,\n\t.param .align 4 .b8 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// \n"; + const 
__KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut 
rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1809usize] = b"\x91\x0e\x050.1.0\x86\x01rust_cuda::common::DeviceConstRef>>\x0b\x86\x01rust_cuda::common::DeviceConstRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00j*const rust_cuda::common::DeviceAccessible>\treferenceh\x00\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>j*const rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>icrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const 
rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = 
b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; + const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, + > = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + PTX_STR.as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b" + .as_bytes(), + ) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_0, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + 
rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_3, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + crate::Empty, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs>::__T_5, + >, + >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) + }, + >; + PTX_STR + } + fn new_kernel() -> rc::rustacuda::error::CudaResult< + rc::host::TypedKernel>, + > { + let ptx = Self::get_ptx_str(); + let entry_point = "kernel_dfae7eaf723a670c_kernel_aab1c403129e575b"; + rc::host::TypedKernel::new(ptx, entry_point) + } + } + unsafe impl KernelPtx> + for Launcher> { + fn get_ptx_str() -> &'static str { + const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_54d0891c50855d77\n.visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_4,\n\t.param .align 4 .b8 
kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// >\n"; + const __KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + 
const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8f\x01*const rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x8f\x01*const 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>i\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; + const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; + const _: rc::safety::kernel_signature::Assert< + { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, + > = rc::safety::kernel_signature::Assert::< + { + rc::safety::kernel_signature::check( + PTX_STR.as_bytes(), + ".visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77" + .as_bytes(), + ) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_0, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 
'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceConstRef< + 'static, + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_3, + >, + >, + >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::common::DeviceMutRef< + 'static, + rc::common::DeviceAccessible< + <<() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, + >, + >, + >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) + }, + >; + const _: rc::safety::type_layout::Assert< + { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, + > = rc::safety::type_layout::Assert::< + { + rc::safety::type_layout::check::< + rc::utils::device_copy::SafeDeviceCopyWrapper< + <() as KernelArgs< + rc::utils::device_copy::SafeDeviceCopyWrapper, + >>::__T_5, + >, + >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) + }, + >; + PTX_STR + } + fn new_kernel() -> 
rc::rustacuda::error::CudaResult< + rc::host::TypedKernel< + dyn Kernel>, + >, + > { + let ptx = Self::get_ptx_str(); + let entry_point = "kernel_dfae7eaf723a670c_kernel_54d0891c50855d77"; + rc::host::TypedKernel::new(ptx, entry_point) + } + } + impl rc::host::Launcher for Launcher { + type CompilationWatcher = (); + type KernelTraitObject = dyn Kernel; + fn get_launch_package(&mut self) -> rc::host::LaunchPackage { + ::core::panicking::panic("not implemented") + } + } +} diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 55a2e8046..981c9bccc 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -38,7 +38,7 @@ pub struct Empty([u8; 0]); #[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); -#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] #[kernel(crate = "rc")] pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, @@ -49,7 +49,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where + T: rc::safety::StackOnly + rc::safety::NoAliasing, ::CudaRepresentation: rc::safety::StackOnly, + ::CudaAllocation: rc::common::EmptyCudaAlloc, { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -69,7 +71,9 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[cfg(not(target_os = "cuda"))] mod host { - use super::{Kernel, KernelArgs}; + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; #[allow(dead_code)] struct Launcher(core::marker::PhantomData); diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 1b116435c..d383198ec 100644 --- 
a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -247,7 +247,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let r = unsafe { ptx_compiler::sys::nvPTXCompilerCompile( compiler, - options_ptrs.len() as c_int, + c_int::try_from(options_ptrs.len()).unwrap(), options_ptrs.as_ptr().cast(), ) }; diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index c07486c2b..382db35f9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -3,6 +3,7 @@ pub(super) struct KernelConfig { pub(super) linker: syn::Ident, pub(super) kernel: syn::Ident, pub(super) args: syn::Ident, + pub(super) ptx: syn::Ident, pub(super) launcher: syn::Ident, } @@ -17,6 +18,9 @@ impl syn::parse::Parse for KernelConfig { let kernel: syn::Ident = input.parse()?; let _lt_token: syn::token::Lt = input.parse()?; let args: syn::Ident = input.parse()?; + let _comma: syn::token::Comma = input.parse()?; + let ptx: syn::Ident = input.parse()?; + let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; let launcher: syn::Ident = input.parse()?; @@ -26,6 +30,7 @@ impl syn::parse::Parse for KernelConfig { linker, kernel, args, + ptx, launcher, }) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d412bd316..b3e215a20 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -3,7 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; 
pub(super) fn quote_get_ptx_str( crate_path: &syn::Path, @@ -30,14 +30,8 @@ pub(super) fn quote_get_ptx_str( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let cpu_func_lifetime_erased_types = super::kernel_func_async::generate_launch_types( - crate_path, - config, - generics, - inputs, - macro_type_ids, - ) - .1; + let cpu_func_lifetime_erased_types = + generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); let matching_kernel_assert = if skip_kernel_compilation() { quote!() @@ -93,7 +87,83 @@ pub(super) fn quote_get_ptx_str( #(#type_layout_asserts)* + #[deny(improper_ctypes)] + mod __rust_cuda_ffi_safe_assert { + use super::#args; + + extern "C" { #( + #[allow(dead_code)] + static #func_params: #cpu_func_lifetime_erased_types; + )* } + } + PTX_STR } } } + +fn generate_lifetime_erased_types( + crate_path: &syn::Path, + KernelConfig { args, .. }: &KernelConfig, + DeclGenerics { + generic_start_token, + generic_close_token, + .. + }: &DeclGenerics, + FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, + macro_type_ids: &[syn::Ident], +) -> Vec { + let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); + + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .enumerate() + .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + syn::FnArg::Typed(syn::PatType { ty, .. }) => { + let type_ident = quote::format_ident!("__T_{}", i); + let syn_type = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + + let cuda_type = match cuda_mode { + InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> + }, + InputCudaType::LendRustToCuda => quote::quote_spanned! 
{ ty.span()=> + #crate_path::common::DeviceAccessible< + <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation + > + }, + }; + + cpu_func_lifetime_erased_types.push( + if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { + if mutability.is_some() { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<'static, #cuda_type> + } + } else { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceConstRef<'static, #cuda_type> + } + } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<'static, #cuda_type> + } + } else { + cuda_type + }, + ); + }, + syn::FnArg::Receiver(_) => unreachable!(), + }); + + cpu_func_lifetime_erased_types +} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index aedf1e12e..91f94a568 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -3,22 +3,18 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; mod get_ptx_str; -mod kernel_func; -mod kernel_func_async; mod new_kernel; use get_ptx_str::quote_get_ptx_str; -use kernel_func::quote_kernel_func; -use kernel_func_async::quote_kernel_func_async; use new_kernel::quote_new_kernel; pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { visibility, - kernel, linker, launcher, + ptx, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { @@ -30,7 +26,6 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs: &FunctionInputs, func_ident: &FuncIdent, func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], ) -> TokenStream { let macro_types = generic_params .iter() @@ -72,42 +67,18 @@ pub(in super::super) fn quote_cpu_linker_macro( func_ident, ¯o_type_ids, ); - let kernel_func = quote_kernel_func( - crate_path, - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); - let kernel_func_async = quote_kernel_func_async( - crate_path, - config, - decl_generics, - func_inputs, - func_ident, - func_params, - func_attrs, - ¯o_type_ids, - ); quote! { #[cfg(not(target_os = "cuda"))] #cpu_linker_macro_visibility macro_rules! #linker { (#(#macro_types),* $(,)?) => { - unsafe impl #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token + unsafe impl #ptx #generic_start_token #($#macro_type_ids),* #generic_close_token for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token { #get_ptx_str #new_kernel - - #kernel_func - - #kernel_func_async } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs deleted file mode 100644 index 4851af9ce..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper.rs +++ /dev/null @@ -1,193 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::utils::r2c_move_lifetime; - -use super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; - -pub(in super::super) fn quote_cpu_wrapper( - crate_path: &syn::Path, - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - generic_wrapper_params, - generic_wrapper_where_clause, - .. 
- }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; - - let (new_func_inputs_decl, new_func_inputs_async_decl) = - generate_new_func_inputs_decl(crate_path, config, impl_generics, func_inputs); - - quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; - - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > where #launcher_predicate; - - #(#func_attrs)* - #[allow(clippy::too_many_arguments)] - fn #func_ident <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_decl),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - - #(#func_attrs)* - #[allow(clippy::too_many_arguments)] - fn #func_ident_async <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_async_decl),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause; - } - } -} - -fn generate_new_func_inputs_decl( - crate_path: 
&syn::Path, - KernelConfig { args, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: syn_type, - })) - } else { - syn_type - } - }, - }), - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: pat.clone(), - colon_token: *colon_token, - ty: { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: Box = - syn::parse_quote!(<() as #args #ty_generics>::#type_ident); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => syn::parse_quote!( - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - ), - InputCudaType::LendRustToCuda => syn::parse_quote!( - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - ), - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let wrapped_type = if mutability.is_some() { - syn::parse_quote!( - #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> - ) - } else { - syn::parse_quote!( - #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> - ) - }; - - Box::new(wrapped_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - - let wrapped_type = syn::parse_quote!( - #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> - ); - - Box::new(wrapped_type) - } else { - cuda_type - } - }, - }) - ), - syn::FnArg::Receiver(_) => unreachable!(), - }).unzip() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs similarity index 84% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index d6e70e276..94b4b9598 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,15 +1,16 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::{ + DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func( +pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { kernel, args, .. }: &KernelConfig, + ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { - generic_start_token, generic_wrapper_params, - generic_close_token, generic_wrapper_where_clause, .. 
}: &DeclGenerics, @@ -17,9 +18,34 @@ pub(super) fn quote_kernel_func( fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs = func_inputs + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let generic_wrapper_where_clause = match generic_wrapper_where_clause { + Some(syn::WhereClause { + where_token, + predicates, + }) if !predicates.is_empty() => { + let comma = if predicates.empty_or_trailing() { + quote!() + } else { + quote!(,) + }; + + quote! { + #where_token #predicates #comma #launcher_predicate + } + }, + _ => quote! { + where #launcher_predicate + }, + }; + + let kernel_func_inputs = func_inputs .iter() .enumerate() .map(|(i, arg)| match arg { @@ -31,9 +57,7 @@ pub(super) fn quote_kernel_func( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! { - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; if let syn::Type::Reference(syn::TypeReference { @@ -60,10 +84,13 @@ pub(super) fn quote_kernel_func( quote! 
{ #(#func_attrs)* #[allow(clippy::needless_lifetimes)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] fn #func_ident <'stream, #generic_wrapper_params>( &mut self, stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs),* + #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs similarity index 89% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index c24406c9a..efe8026eb 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -3,21 +3,16 @@ use syn::spanned::Spanned; use crate::kernel::utils::r2c_move_lifetime; -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - macro_type_ids: &[syn::Ident], ) -> Vec { func_inputs .iter() @@ -32,9 +27,7 @@ pub(super) fn generate_async_func_types( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! 
{ - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; let cuda_type = match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs similarity index 63% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 16cd0008e..454bdcd57 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -3,24 +3,18 @@ use syn::spanned::Spanned; use crate::kernel::utils::r2c_move_lifetime; -use super::super::super::super::{DeclGenerics, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, KernelConfig { args, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - macro_type_ids: &[syn::Ident], -) -> (Vec, Vec, Vec) { +) -> (Vec, Vec) { let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); func_inputs @@ -31,9 +25,7 @@ pub(in super::super) fn generate_launch_types( syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! 
{ ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident + <() as #args #ty_generics>::#type_ident }; cpu_func_unboxed_types.push(syn_type.clone()); @@ -75,33 +67,9 @@ pub(in super::super) fn generate_launch_types( quote! { #cuda_type } }, ); - - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); }, syn::FnArg::Receiver(_) => unreachable!(), }); - ( - cpu_func_types_launch, - cpu_func_lifetime_erased_types, - cpu_func_unboxed_types, - ) + (cpu_func_types_launch, cpu_func_unboxed_types) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs similarity index 70% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index c01dcdce3..462855156 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -1,20 +1,21 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod async_func_types; mod launch_types; mod type_wrap; use async_func_types::generate_async_func_types; 
-pub(super) use launch_types::generate_launch_types; +use launch_types::generate_launch_types; use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config @ KernelConfig { args, .. }: &KernelConfig, - decl_generics @ DeclGenerics { + config @ KernelConfig { kernel, .. }: &KernelConfig, + impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + DeclGenerics { generic_wrapper_params, generic_wrapper_where_clause, .. @@ -25,33 +26,50 @@ pub(super) fn quote_kernel_func_async( }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], - macro_type_ids: &[syn::Ident], ) -> TokenStream { - let new_func_inputs_async = generate_async_func_types( - crate_path, - config, - decl_generics, - func_inputs, - macro_type_ids, - ); + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let generic_wrapper_where_clause = match generic_wrapper_where_clause { + Some(syn::WhereClause { + where_token, + predicates, + }) if !predicates.is_empty() => { + let comma = if predicates.empty_or_trailing() { + quote!() + } else { + quote!(,) + }; + + quote! { + #where_token #predicates #comma #launcher_predicate + } + }, + _ => quote! { + where #launcher_predicate + }, + }; + + let kernel_func_async_inputs = + generate_async_func_types(crate_path, config, impl_generics, func_inputs); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); - let (cpu_func_types_launch, cpu_func_lifetime_erased_types, cpu_func_unboxed_types) = - generate_launch_types( - crate_path, - config, - decl_generics, - func_inputs, - macro_type_ids, - ); + let (cpu_func_types_launch, cpu_func_unboxed_types) = + generate_launch_types(crate_path, config, impl_generics, func_inputs); quote! 
{ #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] fn #func_ident_async <'stream, #generic_wrapper_params>( &mut self, stream: &'stream #crate_path::rustacuda::stream::Stream, - #(#new_func_inputs_async),* + #(#kernel_func_async_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> #generic_wrapper_where_clause { @@ -77,16 +95,6 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::redundant_closure_call)] (|#(#func_params: #cpu_func_types_launch),*| { - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - use super::#args; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - if false { #[allow(dead_code)] fn assert_impl_devicecopy(_val: &T) {} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/kernel_func_async/type_wrap.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs new file mode 100644 index 000000000..1b984f920 --- /dev/null +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -0,0 +1,96 @@ +use proc_macro2::TokenStream; + +use super::super::{ + BlanketGenerics, DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig, +}; + +mod kernel_func; +mod kernel_func_async; + +use kernel_func::quote_kernel_func_inputs; +use kernel_func_async::quote_kernel_func_async; + +#[allow(clippy::too_many_arguments)] +pub(in super::super) fn quote_cpu_wrapper( + crate_path: &syn::Path, + config @ KernelConfig { + 
visibility, + kernel, + ptx, + .. + }: &KernelConfig, + decl @ DeclGenerics { + generic_start_token, + generic_trait_params, + generic_close_token, + generic_trait_where_clause, + .. + }: &DeclGenerics, + impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + BlanketGenerics { + blanket_ty, + impl_generics: blanket_impl_generics, + where_clause: blanket_where_clause, + }: &BlanketGenerics, + func_inputs: &FunctionInputs, + fn_ident: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let launcher_predicate = quote! { + Self: Sized + #crate_path::host::Launcher< + KernelTraitObject = dyn #kernel #ty_generics + > + }; + + let kernel_func = quote_kernel_func_inputs( + crate_path, + config, + impl_generics, + decl, + func_inputs, + fn_ident, + func_params, + func_attrs, + ); + let kernel_func_async = quote_kernel_func_async( + crate_path, + config, + impl_generics, + decl, + func_inputs, + fn_ident, + func_params, + func_attrs, + ); + + quote! 
{ + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + #visibility unsafe trait #ptx #generic_start_token #generic_trait_params #generic_close_token + #generic_trait_where_clause + { + fn get_ptx_str() -> &'static str where #launcher_predicate; + + fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< + #crate_path::host::TypedKernel + > where #launcher_predicate; + } + + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token: #ptx #ty_generics + #generic_trait_where_clause + { + #kernel_func + + #kernel_func_async + } + + #[cfg(not(target_os = "cuda"))] + #[allow(clippy::missing_safety_doc)] + unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty + #blanket_where_clause + {} + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 76b88eee6..b720a8965 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -31,8 +31,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! 
as impl KERNEL for LAUNCHER)] expects \ + LINKER, KERNEL, ARGS, PTX, and LAUNCHER identifiers: {:?}", err ) }, @@ -172,14 +172,34 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { gt_token: generic_close_token, where_clause: generic_trait_where_clause.clone(), }; - let impl_generics = { - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - - ImplGenerics { - impl_generics, - ty_generics, - where_clause, - } + let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); + let blanket_ty = syn::Ident::new("K", Span::mixed_site()); + let mut blanket_params = generic_trait_params.clone(); + let ptx = &config.ptx; + blanket_params.push(syn::GenericParam::Type(syn::TypeParam { + attrs: Vec::new(), + ident: blanket_ty.clone(), + colon_token: syn::parse_quote!(:), + bounds: syn::parse_quote!(#ptx #ty_generics), + eq_token: None, + default: None, + })); + let trait_blanket_generics = syn::Generics { + lt_token: Some(generic_start_token.unwrap_or(syn::parse_quote!(<))), + params: blanket_params, + gt_token: Some(generic_close_token.unwrap_or(syn::parse_quote!(>))), + where_clause: generic_trait_where_clause.clone(), + }; + let (blanket_impl_generics, _, blanket_where_clause) = trait_blanket_generics.split_for_impl(); + let blanket_generics = BlanketGenerics { + blanket_ty, + impl_generics: blanket_impl_generics, + where_clause: blanket_where_clause, + }; + let impl_generics = ImplGenerics { + impl_generics, + ty_generics, + where_clause, }; let func_ident = FuncIdent { @@ -237,8 +257,10 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &config, &decl_generics, &impl_generics, + &blanket_generics, &func_inputs, &func_ident, + &func_params, &func.attrs, ); let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); @@ -249,7 +271,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_inputs, &func_ident, &func_params, - &func.attrs, ); let 
cuda_wrapper = quote_cuda_wrapper( &crate_path, @@ -307,6 +328,12 @@ struct ImplGenerics<'f> { where_clause: Option<&'f syn::WhereClause>, } +struct BlanketGenerics<'f> { + blanket_ty: syn::Ident, + impl_generics: syn::ImplGenerics<'f>, + where_clause: Option<&'f syn::WhereClause>, +} + #[allow(clippy::struct_field_names)] struct FuncIdent<'f> { func_ident: &'f syn::Ident, diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index c6659e9c9..549f5ab56 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -56,7 +56,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::host::CombinedCudaAlloc< + #crate_path::common::CombinedCudaAlloc< <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > @@ -102,7 +102,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! 
{ - #crate_path::host::CombinedCudaAlloc< + #crate_path::common::CombinedCudaAlloc< <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1ff844645..1682c0c80 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -81,18 +81,17 @@ pub fn rust_to_cuda_trait( { type CudaRepresentation = #struct_name_cuda #ty_generics; - #[cfg(not(target_os = "cuda"))] type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, ) -> #crate_path::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, - #crate_path::host::CombinedCudaAlloc + #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::host::NullCudaAlloc; + let alloc_front = #crate_path::common::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -101,14 +100,14 @@ pub fn rust_to_cuda_trait( Ok(( #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: #crate_path::host::CombinedCudaAlloc< + alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, ) -> #crate_path::rustacuda::error::CudaResult { @@ -154,15 +153,15 @@ pub fn rust_to_cuda_async_trait( #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, stream: &#crate_path::rustacuda::stream::Stream, ) -> #crate_path::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, - #crate_path::host::CombinedCudaAlloc + #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = 
#crate_path::host::NullCudaAlloc; + let alloc_front = #crate_path::common::NullCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -171,14 +170,14 @@ pub fn rust_to_cuda_async_trait( Ok(( #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::host::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: #crate_path::host::CombinedCudaAlloc< + alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, stream: &#crate_path::rustacuda::stream::Stream, diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 4173d6658..5e11ffc8c 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - #crate_path::host::NullCudaAlloc + #crate_path::common::NullCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/common.rs b/src/common.rs index abb196c05..6a7e7d926 100644 --- a/src/common.rs +++ b/src/common.rs @@ -79,9 +79,7 @@ impl DerefMut for DeviceAccessible { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCuda { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation: crate::host::CudaAlloc; + type CudaAllocation: CudaAlloc; type CudaRepresentation: CudaAsRust + TypeGraphLayout; #[cfg(feature = "host")] @@ -97,12 +95,12 @@ pub unsafe trait RustToCuda { /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the /// CPU as it contains a GPU-resident copy of `self`. #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[cfg(feature = "host")] @@ -116,9 +114,9 @@ pub unsafe trait RustToCuda { /// /// This is an internal function and should NEVER be called manually #[allow(clippy::type_complexity)] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult; } @@ -142,13 +140,13 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// be accessed on the CPU as it contains a GPU-resident copy of /// `self`. 
#[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[cfg(feature = "host")] @@ -162,9 +160,9 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// This is an internal function and should NEVER be called manually #[allow(clippy::type_complexity)] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult; } @@ -240,3 +238,43 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { unsafe { &mut *self.pointer } } } + +pub(crate) mod crate_private { + pub mod alloc { + pub trait Sealed {} + } +} + +mod private { + pub mod empty { + pub trait Sealed {} + } +} + +pub trait EmptyCudaAlloc: private::empty::Sealed {} +impl EmptyCudaAlloc for T {} + +pub trait CudaAlloc: crate_private::alloc::Sealed {} +impl CudaAlloc for T {} + +impl crate_private::alloc::Sealed for Option {} + +pub struct NullCudaAlloc; +impl crate_private::alloc::Sealed for NullCudaAlloc {} +impl private::empty::Sealed for NullCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl crate_private::alloc::Sealed for CombinedCudaAlloc {} +impl private::empty::Sealed + for CombinedCudaAlloc +{ +} +impl CombinedCudaAlloc { + pub fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} diff --git a/src/device/mod.rs b/src/device/mod.rs index 45c833923..f7347aa98 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -116,3 +116,7 @@ impl DerefMut for ShallowCopy { &mut self.0 } } + +pub struct SomeCudaAlloc(()); + +impl crate::common::crate_private::alloc::Sealed for SomeCudaAlloc {} diff --git a/src/host.rs b/src/host.rs index 591ed4ed5..7a5eaf854 100644 --- a/src/host.rs +++ b/src/host.rs @@ 
-20,7 +20,9 @@ use rustacuda_core::{DeviceCopy, DevicePointer}; pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; use crate::{ - common::{DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, + common::{ + DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + }, ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; @@ -250,53 +252,17 @@ impl LendToCuda for T { } } -pub(crate) mod private { - pub mod alloc { - pub trait Sealed {} - } - +mod private { pub mod drop { pub trait Sealed: Sized { fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; } } - - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} - -pub trait CudaAlloc: private::alloc::Sealed {} -impl CudaAlloc for T {} - -impl private::alloc::Sealed for Option {} - -pub struct NullCudaAlloc; -impl private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl private::alloc::Sealed for CombinedCudaAlloc {} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } } #[repr(transparent)] pub struct CudaDropWrapper(ManuallyDrop); -impl private::alloc::Sealed for CudaDropWrapper {} +impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) @@ -416,7 +382,7 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); -impl private::alloc::Sealed for HostDeviceBox {} +impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 
ea5f1bba4..91496a47d 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -178,8 +178,6 @@ impl, const STRIDE: usize> BorrowMut<[E]> unsafe impl RustToCuda for SplitSliceOverCudaThreadsConstStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsConstStride, STRIDE>; @@ -187,12 +185,12 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -204,9 +202,9 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } @@ -218,13 +216,13 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -236,9 +234,9 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c2ad169ff..d7b48b05f 100644 --- a/src/utils/aliasing/dynamic.rs 
+++ b/src/utils/aliasing/dynamic.rs @@ -152,8 +152,6 @@ impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicSt } unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = SplitSliceOverCudaThreadsDynamicStride>; @@ -161,12 +159,12 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -181,9 +179,9 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } @@ -193,13 +191,13 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; @@ -214,9 +212,9 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git 
a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 5a3d1695c..019ece1b6 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -12,20 +12,18 @@ pub struct FinalCudaRepresentation(DeviceAccessible); unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} unsafe impl RustToCuda for Final { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = T::CudaAllocation; type CudaRepresentation = FinalCudaRepresentation; #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow(alloc)?; @@ -37,9 +35,9 @@ unsafe impl RustToCuda for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T let inner: &mut T = &mut *(self as *mut Self).cast(); @@ -52,13 +50,13 @@ unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + crate::common::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; @@ -70,9 +68,9 @@ unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: crate::common::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { 
// Safety: Final is a repr(transparent) newtype wrapper around T diff --git a/src/utils/box.rs b/src/utils/box.rs index e3381f022..195536f0d 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -9,8 +9,11 @@ use crate::{ #[cfg(feature = "host")] use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + rustacuda::error::CudaResult, + rustacuda::memory::DeviceBox, + utils::device_copy::SafeDeviceCopyWrapper, }; #[doc(hidden)] @@ -29,8 +32,9 @@ unsafe impl rustacuda_core::DeviceCopy unsafe impl RustToCuda for Box { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(not(feature = "host"))] + type CudaAllocation = crate::device::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 5ed008801..d5c022ede 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -9,8 +9,11 @@ use crate::{ #[cfg(feature = "host")] use crate::{ - host::CombinedCudaAlloc, host::CudaAlloc, host::CudaDropWrapper, rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + rustacuda::error::CudaResult, + rustacuda::memory::DeviceBuffer, + utils::device_copy::SafeDeviceCopyWrapper, }; #[doc(hidden)] @@ -29,8 +32,9 @@ unsafe impl rustacuda_core::DeviceCopy unsafe impl RustToCuda for Box<[T]> { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(not(feature = "host"))] + type CudaAllocation = crate::device::SomeCudaAlloc; type CudaRepresentation = 
BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 1ae0515f9..46a75824c 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,10 +3,13 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}, + common::{CudaAsRust, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; +#[cfg(feature = "host")] +use crate::common::{CombinedCudaAlloc, CudaAlloc}; + #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] pub struct SafeDeviceCopyWrapper(T) @@ -71,30 +74,29 @@ impl SafeDeviceCopyWrapper { } unsafe impl RustToCuda for SafeDeviceCopyWrapper { - #[cfg(feature = "host")] - type CudaAllocation = crate::host::NullCudaAlloc; + type CudaAllocation = NullCudaAlloc; type CudaRepresentation = Self; #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )> { - let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } @@ -105,26 +107,26 @@ unsafe impl RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - 
crate::host::CombinedCudaAlloc, + CombinedCudaAlloc, )> { - let alloc = crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::host::CombinedCudaAlloc, + alloc: CombinedCudaAlloc, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (crate::host::NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 1ecaf91d2..14ffac979 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -3,7 +3,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; use crate::{ - common::{RustToCuda, RustToCudaAsync}, + common::{NullCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -43,6 +43,7 @@ impl DerefMut unsafe impl RustToCuda for CudaExchangeBufferDevice { + type CudaAllocation = NullCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index debe33059..e45efc71e 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,8 +11,10 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, - host::{CombinedCudaAlloc, CudaAlloc, CudaDropWrapper, NullCudaAlloc}, + common::{ + CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync, + }, + host::CudaDropWrapper, safety::SafeDeviceCopy, }; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index a4a8e50f7..711409469 100644 --- a/src/utils/exchange/wrapper.rs +++ 
b/src/utils/exchange/wrapper.rs @@ -15,11 +15,13 @@ use rustacuda::{ }; use crate::{ - common::{DeviceAccessible, RustToCuda, RustToCudaAsync}, + common::{ + CombinedCudaAlloc, DeviceAccessible, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + RustToCudaAsync, + }, host::{ - CombinedCudaAlloc, CudaDropWrapper, EmptyCudaAlloc, HostAndDeviceConstRef, - HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, - HostLockedBox, NullCudaAlloc, + CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, + HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, }, }; diff --git a/src/utils/option.rs b/src/utils/option.rs index 18b86527b..f939f5ba0 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -12,7 +12,10 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::{host::CombinedCudaAlloc, host::CudaAlloc, rustacuda::error::CudaResult}; +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + rustacuda::error::CudaResult, +}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -28,8 +31,6 @@ pub struct OptionCudaRepresentation { unsafe impl rustacuda_core::DeviceCopy for OptionCudaRepresentation {} unsafe impl RustToCuda for Option { - #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] type CudaAllocation = Option<::CudaAllocation>; type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index e1f95ba95..0a8a66c62 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -78,7 +78,7 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] - /// Safety: + /// # Safety /// /// The provided `index` must not be out of bounds. 
#[inline] diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 324c0fdef..5b8cdfc52 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -51,7 +51,7 @@ impl ThreadBlockShared { impl ThreadBlockShared<[T; N]> { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] - /// Safety: + /// # Safety /// /// The provided `index` must not be out of bounds. #[inline] From 0f4fc4639554007b700be0a5aa752a1a69ad541c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 10:04:28 +0000 Subject: [PATCH 027/120] Fixed SomeCudaAlloc import --- .gitignore | 3 + examples/single-source/expanded.rs | 1150 --------------------- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/common.rs | 10 +- src/device/mod.rs | 4 - src/host.rs | 10 +- src/utils/box.rs | 2 +- src/utils/boxed_slice.rs | 2 +- src/utils/device_copy.rs | 12 +- src/utils/exchange/buffer/device.rs | 4 +- src/utils/exchange/buffer/host.rs | 8 +- src/utils/exchange/wrapper.rs | 20 +- 13 files changed, 42 insertions(+), 1189 deletions(-) delete mode 100644 examples/single-source/expanded.rs diff --git a/.gitignore b/.gitignore index 767dae236..218ca8786 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk + +# cargo expand dev output files +**/expanded.rs diff --git a/examples/single-source/expanded.rs b/examples/single-source/expanded.rs deleted file mode 100644 index f16379c37..000000000 --- a/examples/single-source/expanded.rs +++ /dev/null @@ -1,1150 +0,0 @@ -#![feature(prelude_import)] -#![deny(clippy::pedantic)] -#![feature(cfg_version)] -#![feature(const_type_name)] -#![feature(const_refs_to_cell)] -#![feature(const_trait_impl)] -#![feature(const_mut_refs)] -#[prelude_import] -use std::prelude::rust_2021::*; -#[macro_use] -extern crate std; -extern crate alloc; -#[cfg(not(target_os = "cuda"))] -fn main() {} -#[repr(C)] 
-#[layout(crate = "rc::const_type_layout")] -pub struct Dummy(i32); -unsafe impl const rc::const_type_layout::TypeLayout for Dummy { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Dummy { 0: _, .. }: Dummy; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Dummy = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) - = (::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(Dummy(f_0.assume_init())), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for Dummy { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::populate_graph(graph); - } - } -} -#[cuda(crate = "rc")] -#[allow(dead_code)] -pub struct Wrapper { - #[cuda(embed)] - inner: T, -} -#[allow(dead_code)] -#[doc(hidden)] -#[allow(dead_code)] -#[repr(C)] 
-#[layout(free = "T")] -#[layout(crate = "rc :: const_type_layout")] -pub struct WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - inner: rc::common::DeviceAccessible< - ::CudaRepresentation, - >, -} -unsafe impl const rc::const_type_layout::TypeLayout for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "inner", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let WrapperCudaRepresentation { - inner: _, - .. - }: WrapperCudaRepresentation; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - as ::const_type_layout::TypeLayout>::uninit() - } { - let base_ptr: *const WrapperCudaRepresentation = (&raw const uninit) - .cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).inner }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::< - rc::common::DeviceAccessible< - ::CudaRepresentation, - >, - >(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(inner)) - = (::CudaRepresentation, - > as rc::const_type_layout::TypeLayout>::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(WrapperCudaRepresentation { - inner: 
inner.assume_init(), - }), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::CudaRepresentation, - > as rc::const_type_layout::TypeGraph>::populate_graph(graph); - } - } -} -unsafe impl rc::rustacuda_core::DeviceCopy for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{} -unsafe impl rc::common::RustToCuda for Wrapper -where - T: rc::common::RustToCuda, -{ - type CudaRepresentation = WrapperCudaRepresentation; - type CudaAllocation = rc::common::CombinedCudaAlloc< - ::CudaAllocation, - rc::common::NullCudaAlloc, - >; - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( - &self, - alloc: CudaAllocType, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let (field_inner_repr, alloc_front) = rc::common::RustToCuda::borrow( - &self.inner, - alloc_front, - )?; - let borrow = WrapperCudaRepresentation { - inner: field_inner_repr, - }; - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - let alloc_front = rc::common::RustToCuda::restore(&mut self.inner, alloc_front)?; - Ok(alloc_tail) - } -} -unsafe impl rc::common::RustToCudaAsync for Wrapper -where - T: rc::common::RustToCudaAsync, -{ - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( - &self, - alloc: CudaAllocType, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult< - ( - 
rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let (field_inner_repr, alloc_front) = rc::common::RustToCudaAsync::borrow_async( - &self.inner, - alloc_front, - stream, - )?; - let borrow = WrapperCudaRepresentation { - inner: field_inner_repr, - }; - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - let alloc_front = rc::common::RustToCudaAsync::restore_async( - &mut self.inner, - alloc_front, - stream, - )?; - Ok(alloc_tail) - } -} -unsafe impl rc::common::CudaAsRust for WrapperCudaRepresentation -where - T: rc::common::RustToCuda, -{ - type RustRepresentation = Wrapper; -} -#[cuda(crate = "rc")] -pub struct Empty([u8; 0]); -#[allow(dead_code)] -#[doc(hidden)] -#[repr(C)] -#[layout(crate = "rc :: const_type_layout")] -pub struct EmptyCudaRepresentation( - rc::common::DeviceAccessible>, -); -unsafe impl const rc::const_type_layout::TypeLayout for EmptyCudaRepresentation { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let EmptyCudaRepresentation { - 0: _, - .. 
- }: EmptyCudaRepresentation; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const EmptyCudaRepresentation = (&raw const uninit) - .cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::< - rc::common::DeviceAccessible< - rc::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>, - >, - >(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let (rc::const_type_layout::MaybeUninhabited::Inhabited(f_0)) - = (, - > as rc::const_type_layout::TypeLayout>::uninit()) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new(EmptyCudaRepresentation(f_0.assume_init())), - ) - } else { - rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for EmptyCudaRepresentation { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - , - > as rc::const_type_layout::TypeGraph>::populate_graph(graph); - } - } -} -unsafe impl rc::rustacuda_core::DeviceCopy for EmptyCudaRepresentation {} -unsafe impl rc::common::RustToCuda for Empty { - type CudaRepresentation = EmptyCudaRepresentation; - type CudaAllocation = rc::common::NullCudaAlloc; - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( - &self, - alloc: CudaAllocType, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = 
rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let field_0_repr = rc::common::DeviceAccessible::from(&self.0); - let borrow = EmptyCudaRepresentation(field_0_repr); - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - Ok(alloc_tail) - } -} -unsafe impl rc::common::RustToCudaAsync for Empty { - #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( - &self, - alloc: CudaAllocType, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult< - ( - rc::common::DeviceAccessible, - rc::common::CombinedCudaAlloc, - ), - > { - let alloc_front = rc::common::NullCudaAlloc; - let alloc_tail = alloc; - let field_0_repr = rc::common::DeviceAccessible::from(&self.0); - let borrow = EmptyCudaRepresentation(field_0_repr); - Ok(( - rc::common::DeviceAccessible::from(borrow), - rc::common::CombinedCudaAlloc::new(alloc_front, alloc_tail), - )) - } - #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, - alloc: rc::common::CombinedCudaAlloc, - stream: &rc::rustacuda::stream::Stream, - ) -> rc::rustacuda::error::CudaResult { - let (alloc_front, alloc_tail) = alloc.split(); - Ok(alloc_tail) - } -} -unsafe impl rc::common::CudaAsRust for EmptyCudaRepresentation { - type RustRepresentation = Empty; -} -#[repr(C)] -#[layout(crate = "rc::const_type_layout")] -pub struct Tuple(u32, i32); -unsafe impl const rc::const_type_layout::TypeLayout for Tuple { - const TYPE_LAYOUT: rc::const_type_layout::TypeLayoutInfo<'static> = { - rc::const_type_layout::TypeLayoutInfo { - name: ::core::any::type_name::(), - size: ::core::mem::size_of::(), - alignment: ::core::mem::align_of::(), - structure: rc::const_type_layout::TypeStructure::Struct { - repr: "C", - fields: &[ - 
rc::const_type_layout::Field { - name: "0", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Tuple { 0: _, .. }: Tuple; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Tuple = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).0 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - rc::const_type_layout::Field { - name: "1", - offset: { - { - #[allow(clippy::unneeded_field_pattern)] - let Tuple { 1: _, .. }: Tuple; - if let ::const_type_layout::MaybeUninhabited::Inhabited( - uninit, - ) - = unsafe { - ::uninit() - } { - let base_ptr: *const Tuple = (&raw const uninit).cast(); - #[allow(unused_unsafe)] - let field_ptr = unsafe { &raw const (*base_ptr).1 }; - #[allow(clippy::cast_sign_loss)] - let offset = unsafe { - field_ptr.cast::().offset_from(base_ptr.cast()) as usize - }; - #[allow(clippy::forget_non_drop, clippy::forget_copy)] - core::mem::forget(uninit); - ::const_type_layout::MaybeUninhabited::Inhabited(offset) - } else { - ::const_type_layout::MaybeUninhabited::Uninhabited - } - } - }, - ty: ::core::any::type_name::(), - }, - ], - }, - } - }; - unsafe fn uninit() -> rc::const_type_layout::MaybeUninhabited< - ::core::mem::MaybeUninit, - > { - if let ( - rc::const_type_layout::MaybeUninhabited::Inhabited(f_0), - rc::const_type_layout::MaybeUninhabited::Inhabited(f_1), - ) - = ( - ::uninit(), - ::uninit(), - ) { - rc::const_type_layout::MaybeUninhabited::Inhabited( - ::core::mem::MaybeUninit::new( - Tuple(f_0.assume_init(), f_1.assume_init()), - ), - ) - } else { - 
rc::const_type_layout::MaybeUninhabited::Uninhabited - } - } -} -unsafe impl const rc::const_type_layout::TypeGraph for Tuple { - fn populate_graph(graph: &mut rc::const_type_layout::TypeLayoutGraph<'static>) { - if graph.insert(&::TYPE_LAYOUT) { - ::populate_graph(graph); - ::populate_graph(graph); - } - } -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait KernelArgs -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - type __T_0; - type __T_1; - type __T_2; - type __T_3; - type __T_4; - type __T_5; -} -unsafe impl KernelArgs for () -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - type __T_0 = Dummy; - type __T_1 = Wrapper; - type __T_2 = Wrapper; - type __T_3 = core::sync::atomic::AtomicU64; - type __T_4 = Wrapper; - type __T_5 = Tuple; -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait KernelPtx -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - fn get_ptx_str() -> &'static str - where - Self: Sized + rc::host::Launcher>; - fn new_kernel() -> rc::rustacuda::error::CudaResult< - rc::host::TypedKernel>, - > - where - Self: Sized + rc::host::Launcher>; -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe trait Kernel: KernelPtx -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ - #[allow(clippy::needless_lifetimes)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - fn kernel<'stream, '__r2c_lt_0, '__r2c_lt_1, '__r2c_lt_2, '__r2c_move_lt_4, 'a>( - &mut self, - stream: &'stream rc::rustacuda::stream::Stream, - _x: &'__r2c_lt_0 <() as KernelArgs>::__T_0, - _y: &'__r2c_lt_1 mut 
<() as KernelArgs>::__T_1, - _z: &'__r2c_lt_2 <() as KernelArgs>::__T_2, - _v: &'a <() as KernelArgs>::__T_3, - kernel_arg_4: <() as KernelArgs>::__T_4, - s_t: <() as KernelArgs>::__T_5, - ) -> rc::rustacuda::error::CudaResult<()> - where - Self: Sized + rc::host::Launcher>, - { - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - >::SYNC - } - let mut ___x_box = rc::host::HostDeviceBox::from( - rc::rustacuda::memory::DeviceBox::new( - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), - )?, - ); - #[allow(clippy::redundant_closure_call)] - let __result = (|_x| { - rc::host::LendToCuda::lend_to_cuda_mut( - _y, - |mut _y| { - (|_y| { - rc::host::LendToCuda::lend_to_cuda( - _z, - |_z| { - (|_z| { - let mut ___v_box = rc::host::HostDeviceBox::from( - rc::rustacuda::memory::DeviceBox::new( - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), - )?, - ); - #[allow(clippy::redundant_closure_call)] - let __result = (|_v| { - rc::host::LendToCuda::move_to_cuda( - kernel_arg_4, - |mut kernel_arg_4| { - (|kernel_arg_4| { - { - let s_t = rc::utils::device_copy::SafeDeviceCopyWrapper::from( - s_t, - ); - self.kernel_async( - stream, - _x, - _y, - _z, - _v, - kernel_arg_4, - s_t, - )?; - stream.synchronize() - } - })(kernel_arg_4.as_async()) - }, - ) - })(unsafe { - rc::host::HostAndDeviceConstRef::new( - &___v_box, - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_v), - ) - .as_async() - }); - if !__check_is_sync(_v) { - ___v_box - .copy_to(unsafe { &mut *(_v as *const _ as *mut _) })?; - } - ::core::mem::drop(___v_box); - __result - })(_z.as_async()) - }, - ) - })(_y.as_async()) - }, - ) - })(unsafe { - rc::host::HostAndDeviceConstRef::new( - &___x_box, - rc::utils::device_copy::SafeDeviceCopyWrapper::from_ref(_x), - ) - .as_async() - }); - if 
!__check_is_sync(_x) { - ___x_box.copy_to(unsafe { &mut *(_x as *const _ as *mut _) })?; - } - ::core::mem::drop(___x_box); - __result - } - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - fn kernel_async< - 'stream, - '__r2c_lt_0, - '__r2c_lt_1, - '__r2c_lt_2, - '__r2c_move_lt_4, - 'a, - >( - &mut self, - stream: &'stream rc::rustacuda::stream::Stream, - _x: rc::host::HostAndDeviceConstRefAsync< - 'stream, - '__r2c_lt_0, - rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_0>, - >, - mut _y: rc::host::HostAndDeviceMutRefAsync< - 'stream, - '__r2c_lt_1, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _z: rc::host::HostAndDeviceConstRefAsync< - 'stream, - '__r2c_lt_2, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _v: rc::host::HostAndDeviceConstRefAsync< - 'stream, - 'a, - rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_3>, - >, - kernel_arg_4: rc::host::HostAndDeviceOwnedAsync< - 'stream, - '__r2c_move_lt_4, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - s_t: rc::utils::device_copy::SafeDeviceCopyWrapper<<() as KernelArgs>::__T_5>, - ) -> rc::rustacuda::error::CudaResult<()> - where - Self: Sized + rc::host::Launcher>, - { - let rc::host::LaunchPackage { kernel, watcher, config } = rc::host::Launcher::get_launch_package( - self, - ); - let kernel_jit_result = if config.ptx_jit { - kernel - .compile_with_ptx_jit_args( - Some( - &[ - None, - Some(rc::ptx_jit::arg_as_raw_bytes(_y.for_host())), - None, - Some(rc::ptx_jit::arg_as_raw_bytes(_v.for_host())), - None, - None, - ], - ), - )? - } else { - kernel.compile_with_ptx_jit_args(None)? 
- }; - let function = match kernel_jit_result { - rc::host::KernelJITResult::Recompiled(function) => { - ::on_compile(function, watcher)?; - function - } - rc::host::KernelJITResult::Cached(function) => function, - }; - #[allow(clippy::redundant_closure_call)] - (| - _x: rc::common::DeviceConstRef< - '__r2c_lt_0, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_0, - >, - >, - _y: rc::common::DeviceMutRef< - '__r2c_lt_1, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _z: rc::common::DeviceConstRef< - '__r2c_lt_2, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - _v: rc::common::DeviceConstRef< - 'a, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_3, - >, - >, - kernel_arg_4: rc::common::DeviceMutRef< - '__r2c_move_lt_4, - rc::common::DeviceAccessible< - <<() as KernelArgs< - T, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - s_t: rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_5, - >| - { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - #[allow(dead_code)] - fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: rc::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - assert_impl_devicecopy(&_x); - assert_impl_devicecopy(&_y); - assert_impl_devicecopy(&_z); - assert_impl_devicecopy(&_v); - assert_impl_devicecopy(&kernel_arg_4); - assert_impl_devicecopy(&s_t); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_0>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_1>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_2>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_3>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_4>(); - assert_impl_no_aliasing::<<() as KernelArgs>::__T_5>(); - 
assert_impl_fits_into_device_register(&_x); - assert_impl_fits_into_device_register(&_y); - assert_impl_fits_into_device_register(&_z); - assert_impl_fits_into_device_register(&_v); - assert_impl_fits_into_device_register(&kernel_arg_4); - assert_impl_fits_into_device_register(&s_t); - } - let rc::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _ } = config; - unsafe { - stream - .launch( - function, - grid, - block, - shared_memory_size, - &[ - &_x as *const _ as *mut ::std::ffi::c_void, - &_y as *const _ as *mut ::std::ffi::c_void, - &_z as *const _ as *mut ::std::ffi::c_void, - &_v as *const _ as *mut ::std::ffi::c_void, - &kernel_arg_4 as *const _ as *mut ::std::ffi::c_void, - &s_t as *const _ as *mut ::std::ffi::c_void, - ], - ) - } - })( - unsafe { _x.for_device_async() }, - unsafe { _y.for_device_async() }, - unsafe { _z.for_device_async() }, - unsafe { _v.for_device_async() }, - unsafe { kernel_arg_4.for_device_async() }, - s_t, - ) - } -} -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::missing_safety_doc)] -unsafe impl> Kernel for K -where - T: rc::safety::StackOnly, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{} -#[cfg(not(target_os = "cuda"))] -const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, -> = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_type_layout\n\n.visible .entry kernel_type_layout()\n{\n\n\n\tret;\n\n}\n\t// .globl\tkernel_dfae7eaf723a670c\n.visible .entry kernel_dfae7eaf723a670c()\n{\n\n\n\tret;\n\n}\n" - .as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c".as_bytes(), - ) - }, ->; -#[cfg(not(target_os = "cuda"))] -mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; - #[allow(dead_code)] - struct 
Launcher(core::marker::PhantomData); - unsafe impl KernelPtx for Launcher { - fn get_ptx_str() -> &'static str { - const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_aab1c403129e575b\n.visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_4,\n\t.param .align 4 .b8 kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_aab1c403129e575b_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// \n"; - const 
__KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut 
rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1809usize] = b"\x91\x0e\x050.1.0\x86\x01rust_cuda::common::DeviceConstRef>>\x0b\x86\x01rust_cuda::common::DeviceConstRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00j*const rust_cuda::common::DeviceAccessible>\treferenceh\x00\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>j*const rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>icrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x7fcore::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const 
rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1811usize] = b"\x93\x0e\x050.1.0\x84\x01rust_cuda::common::DeviceMutRef>>\x0b\x84\x01rust_cuda::common::DeviceMutRef>>\x08\x08s\x0btransparent\x02\x07pointerh\x00h*mut rust_cuda::common::DeviceAccessible>\treferenceh\x00\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>h*mut rust_cuda::common::DeviceAccessible>\x08\x08pcrust_cuda::common::DeviceAccessible>mcrust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00>single_source::WrapperCudaRepresentation>single_source::WrapperCudaRepresentation\x00\x01s\x01C\x01\x05innerh\x00Krust_cuda::common::DeviceAccessibleKrust_cuda::common::DeviceAccessible\x00\x01s\x0btransparent\x01\x010h\x00&single_source::EmptyCudaRepresentation&single_source::EmptyCudaRepresentation\x00\x01s\x01C\x01\x010h\x00brust_cuda::common::DeviceAccessible>brust_cuda::common::DeviceAccessible>\x00\x01s\x0btransparent\x01\x010h\x00=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>=rust_cuda::utils::device_copy::SafeDeviceCopyWrapper<[u8; 0]>\x00\x01s\x0btransparent\x01\x010h\x00\x07[u8; 0]\x07[u8; 0]\x00\x01a\x02u8\x00\x02u8\x01\x01v\x83\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = 
b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; - const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, - > = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - PTX_STR.as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c_kernel_aab1c403129e575b" - .as_bytes(), - ) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_0, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - 
rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_3, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - crate::Empty, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs>::__T_5, - >, - >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) - }, - >; - PTX_STR - } - fn new_kernel() -> rc::rustacuda::error::CudaResult< - rc::host::TypedKernel>, - > { - let ptx = Self::get_ptx_str(); - let entry_point = "kernel_dfae7eaf723a670c_kernel_aab1c403129e575b"; - rc::host::TypedKernel::new(ptx, entry_point) - } - } - unsafe impl KernelPtx> - for Launcher> { - fn get_ptx_str() -> &'static str { - const PTX_STR: &'static str = "//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 3.2\n.target sm_35\n.address_size 64\n\n\t// .globl\tkernel_dfae7eaf723a670c_kernel_54d0891c50855d77\n.visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77(\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_0,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_2,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3,\n\t.param .u64 kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_4,\n\t.param .align 4 .b8 
kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5[8]\n)\n{\n\t.reg .b32 \t%r<6>;\n\t.reg .b64 \t%rd<7>;\n\t.reg .f64 \t%fd<5>;\n\n\tld.param.u64 \t%rd3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_3];\n\tcvta.to.global.u64 \t%rd4, %rd3;\n\tld.param.u64 \t%rd5, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_1];\n\tcvta.to.global.u64 \t%rd6, %rd5;\n\tld.global.u32 \t%r1, [%rd6];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\tld.param.u32 \t%r3, [kernel_dfae7eaf723a670c_kernel_54d0891c50855d77_param_5];\n\tld.global.u32 \t%r2, [%rd4];\n\t// begin inline asm\n\t// //\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd1_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd1, %rd1_rust_cuda_static_shared;\n\t// end inline asm\n\t// begin inline asm\n\t.shared .align 4 .b8 %rd2_rust_cuda_static_shared[24];\ncvta.shared.u64 %rd2, %rd2_rust_cuda_static_shared;\n\t// end inline asm\n\tcvt.rn.f64.u32 \t%fd1, %r3;\n\tadd.rn.f64 \t%fd2, %fd1, %fd1;\n\tmax.f64 \t%fd3, %fd2, 0d0000000000000000;\n\tmin.f64 \t%fd4, %fd3, 0d41EFFFFFFFE00000;\n\tcvt.rzi.u32.f64 \t%r4, %fd4;\n\tst.u32 \t[%rd1+8], %r4;\n\tmov.u32 \t%r5, 24;\n\tst.u32 \t[%rd2+20], %r5;\n\tret;\n\n}\n\n// >\n"; - const __KERNEL_DFAE7EAF723A670C__X_LAYOUT: &[u8; 879usize] = b"\xef\x06\x050.1.0mrust_cuda::common::DeviceConstRef>\x06mrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00fcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Q*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pJrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiJrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04\x04s\x0btransparent\x01\x010h\x00\x14single_source::Dummy\x14single_source::Dummy\x04\x04s\x01C\x01\x010h\x00\x03i32\x03i32\x04\x04vfcore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - 
const __KERNEL_DFAE7EAF723A670C__Y_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__Z_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\xab\x01rust_cuda::common::DeviceConstRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8f\x01*const rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x8f\x01*const 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>i\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa4\x01core::marker::PhantomData<&rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C__V_LAYOUT: &[u8; 1068usize] = b"\xac\x08\x050.1.0vrust_cuda::common::DeviceConstRef>\x07vrust_cuda::common::DeviceConstRef>\x08\x08s\x0btransparent\x02\x07pointerh\x00Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\treferenceh\x00ocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>Z*const rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08pSrust_cuda::utils::device_copy::SafeDeviceCopyWrapperiSrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x1dcore::sync::atomic::AtomicU64\x1dcore::sync::atomic::AtomicU64\x08\x08s\nC,align(8)\x01\x01vh\x00\x1bcore::cell::UnsafeCell\x1bcore::cell::UnsafeCell\x08\x08s\x15no_nieche,transparent\x01\x05valueh\x00\x03u64\x03u64\x08\x08vocore::marker::PhantomData<&rust_cuda::utils::device_copy::SafeDeviceCopyWrapper>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT: &[u8; 1891usize] = b"\xe3\x0e\x050.1.0\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\xa9\x01rust_cuda::common::DeviceMutRef>>>\x08\x08s\x0btransparent\x02\x07pointerh\x00\x8d\x01*mut rust_cuda::common::DeviceAccessible>>\treferenceh\x00\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x8d\x01*mut 
rust_cuda::common::DeviceAccessible>>\x08\x08p\x88\x01rust_cuda::common::DeviceAccessible>>m\x88\x01rust_cuda::common::DeviceAccessible>>\x08\x08s\x0btransparent\x01\x010h\x00csingle_source::WrapperCudaRepresentation>csingle_source::WrapperCudaRepresentation>\x08\x08s\x01C\x01\x05innerh\x00^rust_cuda::common::DeviceAccessible>^rust_cuda::common::DeviceAccessible>\x08\x08s\x0btransparent\x01\x010h\x009rust_cuda::utils::device_copy::SafeDeviceCopyWrapper9rust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x08s\x0btransparent\x01\x010h\x00\x03u64\x03u64\x08\x08v\xa8\x01core::marker::PhantomData<&mut rust_cuda::common::DeviceAccessible>>>\x00\x01s\x00\x00"; - const __KERNEL_DFAE7EAF723A670C_S_T_LAYOUT: &[u8; 257usize] = b"\x81\x02\x050.1.0Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x04Jrust_cuda::utils::device_copy::SafeDeviceCopyWrapper\x08\x04s\x0btransparent\x01\x010h\x00\x14single_source::Tuple\x14single_source::Tuple\x08\x04s\x01C\x02\x010h\x00\x03u32\x011h\x04\x03i32\x03u32\x04\x04v\x03i32\x04\x04v"; - const _: rc::safety::kernel_signature::Assert< - { rc::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }, - > = rc::safety::kernel_signature::Assert::< - { - rc::safety::kernel_signature::check( - PTX_STR.as_bytes(), - ".visible .entry kernel_dfae7eaf723a670c_kernel_54d0891c50855d77" - .as_bytes(), - ) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_0, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__X_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 
'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_1 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Y_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_2 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__Z_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceConstRef< - 'static, - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_3, - >, - >, - >(__KERNEL_DFAE7EAF723A670C__V_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::common::DeviceMutRef< - 'static, - rc::common::DeviceAccessible< - <<() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_4 as rc::common::RustToCuda>::CudaRepresentation, - >, - >, - >(__KERNEL_DFAE7EAF723A670C_KERNEL_ARG_4_LAYOUT) - }, - >; - const _: rc::safety::type_layout::Assert< - { rc::safety::type_layout::CpuAndGpuTypeLayouts::Match }, - > = rc::safety::type_layout::Assert::< - { - rc::safety::type_layout::check::< - rc::utils::device_copy::SafeDeviceCopyWrapper< - <() as KernelArgs< - rc::utils::device_copy::SafeDeviceCopyWrapper, - >>::__T_5, - >, - >(__KERNEL_DFAE7EAF723A670C_S_T_LAYOUT) - }, - >; - PTX_STR - } - fn new_kernel() -> 
rc::rustacuda::error::CudaResult< - rc::host::TypedKernel< - dyn Kernel>, - >, - > { - let ptx = Self::get_ptx_str(); - let entry_point = "kernel_dfae7eaf723a670c_kernel_54d0891c50855d77"; - rc::host::TypedKernel::new(ptx, entry_point) - } - } - impl rc::host::Launcher for Launcher { - type CompilationWatcher = (); - type KernelTraitObject = dyn Kernel; - fn get_launch_package(&mut self) -> rc::host::LaunchPackage { - ::core::panicking::panic("not implemented") - } - } -} diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 1682c0c80..896e51e89 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -91,7 +91,7 @@ pub fn rust_to_cuda_trait( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NullCudaAlloc; + let alloc_front = #crate_path::common::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -161,7 +161,7 @@ pub fn rust_to_cuda_async_trait( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NullCudaAlloc; + let alloc_front = #crate_path::common::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 5e11ffc8c..fb5b39503 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! 
{ - #crate_path::common::NullCudaAlloc + #crate_path::common::NoCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/common.rs b/src/common.rs index 6a7e7d926..cf44848a4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -259,9 +259,13 @@ impl CudaAlloc for T {} impl crate_private::alloc::Sealed for Option {} -pub struct NullCudaAlloc; -impl crate_private::alloc::Sealed for NullCudaAlloc {} -impl private::empty::Sealed for NullCudaAlloc {} +pub struct NoCudaAlloc; +impl crate_private::alloc::Sealed for NoCudaAlloc {} +impl private::empty::Sealed for NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl crate_private::alloc::Sealed for SomeCudaAlloc {} +impl !private::empty::Sealed for SomeCudaAlloc {} pub struct CombinedCudaAlloc(A, B); impl crate_private::alloc::Sealed for CombinedCudaAlloc {} diff --git a/src/device/mod.rs b/src/device/mod.rs index f7347aa98..45c833923 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -116,7 +116,3 @@ impl DerefMut for ShallowCopy { &mut self.0 } } - -pub struct SomeCudaAlloc(()); - -impl crate::common::crate_private::alloc::Sealed for SomeCudaAlloc {} diff --git a/src/host.rs b/src/host.rs index 7a5eaf854..aed9aaa83 100644 --- a/src/host.rs +++ b/src/host.rs @@ -21,7 +21,7 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_call}; use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NullCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, @@ -196,7 +196,7 @@ impl LendToCuda for T { &self, inner: F, ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); @@ -216,13 +216,13 @@ impl LendToCuda 
for T { &mut self, inner: F, ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); core::mem::drop(cuda_repr); - let _: NullCudaAlloc = unsafe { self.restore(alloc) }?; + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; result } @@ -242,7 +242,7 @@ impl LendToCuda for T { ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc, { - let (cuda_repr, alloc) = unsafe { self.borrow(NullCudaAlloc) }?; + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, inner); diff --git a/src/utils/box.rs b/src/utils/box.rs index 195536f0d..8e81941a1 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -34,7 +34,7 @@ unsafe impl RustToCuda for Box { #[cfg(feature = "host")] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(not(feature = "host"))] - type CudaAllocation = crate::device::SomeCudaAlloc; + type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index d5c022ede..4a06e0a8d 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -34,7 +34,7 @@ unsafe impl RustToCuda for Box<[T]> { #[cfg(feature = "host")] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(not(feature = "host"))] - type CudaAllocation = crate::device::SomeCudaAlloc; + type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 46a75824c..0c77a8d1a 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,7 +3,7 @@ use const_type_layout::TypeGraphLayout; use crate::{ - common::{CudaAsRust, DeviceAccessible, NullCudaAlloc, RustToCuda, 
RustToCudaAsync}, + common::{CudaAsRust, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -74,7 +74,7 @@ impl SafeDeviceCopyWrapper { } unsafe impl RustToCuda for SafeDeviceCopyWrapper { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = Self; #[cfg(feature = "host")] @@ -86,7 +86,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr DeviceAccessible, CombinedCudaAlloc, )> { - let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } @@ -96,7 +96,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr &mut self, alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } @@ -115,7 +115,7 @@ unsafe impl RustToCudaAsync DeviceAccessible, CombinedCudaAlloc, )> { - let alloc = CombinedCudaAlloc::new(NullCudaAlloc, alloc); + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); Ok((DeviceAccessible::from(&self.0), alloc)) } @@ -126,7 +126,7 @@ unsafe impl RustToCudaAsync alloc: CombinedCudaAlloc, _stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NullCudaAlloc, A) = alloc.split(); + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); Ok(alloc_tail) } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 14ffac979..09ffa2b43 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -3,7 +3,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; use crate::{ - common::{NullCudaAlloc, RustToCuda, RustToCudaAsync}, + common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; @@ -43,7 +43,7 @@ impl DerefMut unsafe impl RustToCuda for 
CudaExchangeBufferDevice { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index e45efc71e..384f290bb 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,7 +12,7 @@ use rustacuda::{ use crate::{ common::{ - CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NullCudaAlloc, RustToCuda, RustToCudaAsync, + CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync, }, host::CudaDropWrapper, safety::SafeDeviceCopy, @@ -107,7 +107,7 @@ impl Dere unsafe impl RustToCuda for CudaExchangeBufferHost { - type CudaAllocation = NullCudaAlloc; + type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; #[allow(clippy::type_complexity)] @@ -136,7 +136,7 @@ unsafe impl value: T, device_box: HostDeviceBox::CudaRepresentation>>, locked_cuda_repr: HostLockedBox::CudaRepresentation>>, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -57,7 +57,7 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda::CudaRepresentation>>, locked_cuda_repr: HostLockedBox::CudaRepresentation>>, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NullCudaAlloc>, + null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, waker: Arc>>, @@ -73,7 +73,7 @@ impl> ExchangeWrapperOnHost { // called first, which initialised the memory. 
let device_box = unsafe { DeviceBox::uninitialized() }?.into(); - let (cuda_repr, _null_alloc) = unsafe { value.borrow(NullCudaAlloc) }?; + let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); @@ -99,7 +99,7 @@ impl> ExchangeWrapperOnHost { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_device(mut self) -> CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NullCudaAlloc) }?; + let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; *self.locked_cuda_repr = cuda_repr; self.device_box.copy_from(&self.locked_cuda_repr)?; @@ -129,7 +129,7 @@ impl> ExchangeWrapperOnHost CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NullCudaAlloc, stream) }?; + let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; *self.locked_cuda_repr = cuda_repr; // Safety: The device value is not safely exposed until either @@ -347,7 +347,7 @@ impl<'stream, T: RustToCuda> /// CUDA pub fn move_to_host(mut self) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -378,7 +378,7 @@ impl<'stream, T: RustToCudaAsync> stream: &'stream Stream, ) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU @@ -456,7 +456,7 @@ impl> ExchangeWrapperOnDevice { /// CUDA pub fn move_to_host(mut self) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = unsafe { 
self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -499,7 +499,7 @@ impl> ExchangeWrapperOnDevice stream: &Stream, ) -> CudaResult> { // Reflect deep changes back to the CPU - let _null_alloc: NullCudaAlloc = + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(self.null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU From 41d36161359e3d275abd79e10652433d450cf217 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 May 2023 14:59:20 +0000 Subject: [PATCH 028/120] Added error handling to the compile-time PTX checking --- rust-cuda-derive/Cargo.toml | 6 +- rust-cuda-derive/build.rs | 2 + rust-cuda-derive/src/kernel/link/mod.rs | 272 ++++++++++------ .../src/kernel/link/ptx_compiler_sys.rs | 301 ++++++++++++++++++ 4 files changed, 485 insertions(+), 96 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 788a08716..41ad5a33f 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -22,7 +22,9 @@ serde_json = "1.0" cargo_metadata = { version = "0.18", features = ["builder"] } strip-ansi-escapes = "0.2" colored = "2.0" - +thiserror = "1.0" seahash = "4.1" ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } -ptx_compiler = "0.1" + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-derive/build.rs index 27d940ad2..f7aa5b1a9 100644 --- a/rust-cuda-derive/build.rs +++ b/rust-cuda-derive/build.rs @@ -1,3 +1,5 @@ fn main() { + find_cuda_helper::include_cuda(); + println!("cargo:rustc-link-lib=nvptxcompiler_static"); } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index d383198ec..f6f4719c4 100644 --- 
a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,9 +1,9 @@ use std::{ env, ffi::CString, + fmt::Write as FmtWrite, fs, io::{Read, Write}, - mem::MaybeUninit, os::raw::c_int, path::{Path, PathBuf}, ptr::addr_of_mut, @@ -16,15 +16,16 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; -use ptx_compiler::sys::size_t; use super::utils::skip_kernel_compilation; mod config; mod error; +mod ptx_compiler_sys; use config::{CheckKernelConfig, LinkKernelConfig}; use error::emit_ptx_build_error; +use ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { @@ -206,100 +207,32 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - let mut compiler = MaybeUninit::uninit(); - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerCreate( - compiler.as_mut_ptr(), - kernel_ptx.len() as size_t, - kernel_ptx.as_ptr().cast(), - ) - }; - emit_call_site_warning!("PTX compiler create result {}", r); - let compiler = unsafe { compiler.assume_init() }; - - let mut major = 0; - let mut minor = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) - }; - emit_call_site_warning!("PTX version result {}", r); - emit_call_site_warning!("PTX compiler version {}.{}", major, minor); + let (result, error_log, info_log, version, drop) = + check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash); - let kernel_name = if specialisation.is_empty() { - format!("{kernel_hash}_kernel") - } else { - format!( - "{kernel_hash}_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; - - let options = vec![ - CString::new("--entry").unwrap(), - CString::new(kernel_name).unwrap(), - CString::new("--verbose").unwrap(), - CString::new("--warn-on-double-precision-use").unwrap(), - 
CString::new("--warn-on-local-memory-usage").unwrap(), - CString::new("--warn-on-spills").unwrap(), - ]; - let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); - - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerCompile( - compiler, - c_int::try_from(options_ptrs.len()).unwrap(), - options_ptrs.as_ptr().cast(), - ) + let ptx_compiler = match &version { + Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), + Err(_) => String::from("PTX compiler"), }; - emit_call_site_warning!("PTX compile result {}", r); - let mut info_log_size = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) - }; - emit_call_site_warning!("PTX info log size result {}", r); - #[allow(clippy::cast_possible_truncation)] - let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); - if info_log_size > 0 { - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) - }; - emit_call_site_warning!("PTX info log content result {}", r); - #[allow(clippy::cast_possible_truncation)] - unsafe { - info_log.set_len(info_log_size as usize); - } - } - let info_log = String::from_utf8_lossy(&info_log); - - let mut error_log_size = 0; - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) - }; - emit_call_site_warning!("PTX error log size result {}", r); - #[allow(clippy::cast_possible_truncation)] - let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); - if error_log_size > 0 { - let r = unsafe { - ptx_compiler::sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) - }; - emit_call_site_warning!("PTX error log content result {}", r); - #[allow(clippy::cast_possible_truncation)] - unsafe { - error_log.set_len(error_log_size as usize); - } + // TODO: allow user to select + // - warn on double + // - warn on float + // - warn on spills + // - verbose warn + // - warnings as errors + // - 
show PTX source if warning or error + + let mut errors = String::new(); + if let Err(err) = drop { + let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n")); } - let error_log = String::from_utf8_lossy(&error_log); - - // Ensure the compiler is not dropped - let mut compiler = MaybeUninit::new(compiler); - let r = unsafe { ptx_compiler::sys::nvPTXCompilerDestroy(compiler.as_mut_ptr()) }; - emit_call_site_warning!("PTX compiler destroy result {}", r); - - if !info_log.is_empty() { - emit_call_site_warning!("PTX compiler info log:\n{}", info_log); + if let Err(err) = version { + let _ = errors.write_fmt(format_args!( + "Error fetching the version of the {ptx_compiler}: {err}\n" + )); } - if !error_log.is_empty() { + if let (Ok(Some(_)), _) | (_, Ok(Some(_))) = (&info_log, &error_log) { let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; let mut indent = 0; while max_lines > 0 { @@ -307,9 +240,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { indent += 1; } - abort_call_site!( - "PTX compiler error log:\n{}\nPTX source:\n{}", - error_log, + emit_call_site_warning!( + "PTX source code:\n{}", kernel_ptx .lines() .enumerate() @@ -318,10 +250,162 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .join("\n") ); } + match info_log { + Ok(None) => (), + Ok(Some(info_log)) => emit_call_site_warning!("{ptx_compiler} info log:\n{}", info_log), + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the info log of the {ptx_compiler}: {err}\n" + )); + }, + }; + match error_log { + Ok(None) => (), + Ok(Some(error_log)) => emit_call_site_error!("{ptx_compiler} error log:\n{}", error_log), + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the error log of the {ptx_compiler}: {err}\n" + )); + }, + }; + if let Err(err) = result { + let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n")); + } + if !errors.is_empty() { + 
abort_call_site!("{}", errors); + } (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() } +#[allow(clippy::type_complexity)] +fn check_kernel_ptx( + kernel_ptx: &str, + specialisation: &str, + kernel_hash: &proc_macro2::Ident, +) -> ( + Result<(), NvptxError>, + Result, NvptxError>, + Result, NvptxError>, + Result<(u32, u32), NvptxError>, + Result<(), NvptxError>, +) { + let compiler = { + let mut compiler = std::ptr::null_mut(); + if let Err(err) = NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCreate( + addr_of_mut!(compiler), + kernel_ptx.len() as ptx_compiler_sys::size_t, + kernel_ptx.as_ptr().cast(), + ) + }) { + abort_call_site!("PTX compiler creation failed: {}", err); + } + compiler + }; + + let result = { + let kernel_name = if specialisation.is_empty() { + format!("{kernel_hash}_kernel") + } else { + format!( + "{kernel_hash}_kernel_{:016x}", + seahash::hash(specialisation.as_bytes()) + ) + }; + + let options = vec![ + CString::new("--entry").unwrap(), + CString::new(kernel_name).unwrap(), + CString::new("--verbose").unwrap(), + CString::new("--warn-on-double-precision-use").unwrap(), + CString::new("--warn-on-local-memory-usage").unwrap(), + CString::new("--warn-on-spills").unwrap(), + ]; + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + c_int::try_from(options_ptrs.len()).unwrap(), + options_ptrs.as_ptr().cast(), + ) + }) + }; + + let error_log = (|| { + let mut error_log_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + })?; + + if error_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, 
error_log.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + error_log.set_len(error_log_size as usize); + } + + Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) + })(); + + let info_log = (|| { + let mut info_log_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + })?; + + if info_log_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + info_log.set_len(info_log_size as usize); + } + + Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) + })(); + + let version = (|| { + let mut major = 0; + let mut minor = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + })?; + + Ok((major, minor)) + })(); + + let drop = { + let mut compiler = compiler; + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) + }) + }; + + (result, error_log, info_log, version, drop) +} + fn compile_kernel( args: &syn::Ident, crate_name: &str, diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs new file mode 100644 index 000000000..93837a418 --- /dev/null +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -0,0 +1,301 @@ +use thiserror::Error; + +#[allow(non_camel_case_types)] +pub type size_t = ::std::os::raw::c_ulonglong; + +#[repr(C)] +pub struct nvPTXCompiler { + _private: [u8; 0], +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Error)] +#[non_exhaustive] +pub enum NvptxError { + #[error("Invalid compiler handle")] + InvalidCompilerHandle, + #[error("Invalid PTX 
input")] + InvalidInput, + #[error("Compilation failure")] + CompilationFailure, + #[error("Internal error")] + Internal, + #[error("Out of memory")] + OutOfMemory, + #[error("Incomplete compiler invocation")] + CompilerInvocationIncomplete, + #[error("Unsupported PTX version")] + UnsupportedPtxVersion, + #[error("Unsupported dev-side sync")] + UnsupportedDevSideSync, + #[error("Unknown error code")] + UnknownError, +} + +impl NvptxError { + const NVPTXCOMPILE_ERROR_COMPILATION_FAILURE: NvptxCompileResult = 3; + const NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE: NvptxCompileResult = 6; + const NVPTXCOMPILE_ERROR_INTERNAL: NvptxCompileResult = 4; + const NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE: NvptxCompileResult = 1; + const NVPTXCOMPILE_ERROR_INVALID_INPUT: NvptxCompileResult = 2; + const NVPTXCOMPILE_ERROR_OUT_OF_MEMORY: NvptxCompileResult = 5; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC: NvptxCompileResult = 8; + const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; + const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; + + pub fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + match result { + Self::NVPTXCOMPILE_SUCCESS => Ok(()), + Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), + Self::NVPTXCOMPILE_ERROR_INVALID_INPUT => Err(Self::InvalidInput), + Self::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE => Err(Self::CompilationFailure), + Self::NVPTXCOMPILE_ERROR_INTERNAL => Err(Self::Internal), + Self::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY => Err(Self::OutOfMemory), + Self::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE => { + Err(Self::CompilerInvocationIncomplete) + }, + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION => Err(Self::UnsupportedPtxVersion), + Self::NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC => Err(Self::UnsupportedDevSideSync), + _ => Err(Self::UnknownError), + } + } +} + +/// [`nvPTXCompilerHandle`] represents a handle to the PTX Compiler. 
+/// +/// To compile a PTX program string, an instance of [`nvPTXCompiler`] +/// must be created and the handle to it must be obtained using the +/// API [`nvPTXCompilerCreate`]. Then the compilation can be done +/// using the API [`nvPTXCompilerCompile`]. +pub type NvptxCompilerHandle = *mut nvPTXCompiler; + +/// The [`nvPTXCompiler`] APIs return the [`nvPTXCompileResult`] codes to +/// indicate the call result"] +pub type NvptxCompileResult = ::std::os::raw::c_int; + +extern "C" { + /// Queries the current major and minor version of PTX Compiler APIs being + /// used + /// + /// # Parameters + /// - [out] `major`: Major version of the PTX Compiler APIs + /// - [out] `minor`: Minor version of the PTX Compiler APIs + /// + /// # Return + /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_SUCCESS` + /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_ERROR_INTERNAL` + /// + /// # Note + /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning. + /// The PTX ISA version supported by a PTX Compiler API version is listed + /// [here](https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes). 
+ pub fn nvPTXCompilerGetVersion( + major: *mut ::std::os::raw::c_uint, + minor: *mut ::std::os::raw::c_uint, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the handle to an instance of the PTX compiler"] + #[doc = " initialized with the given PTX program \\p ptxCode"] + #[doc = ""] + #[doc = " \\param [out] compiler Returns a handle to PTX compiler initialized"] + #[doc = " with the PTX program \\p ptxCode"] + #[doc = " \\param [in] ptxCodeLen Size of the PTX program \\p ptxCode passed as \ + string"] + #[doc = " \\param [in] ptxCode The PTX program which is to be compiled passed as \ + string."] + #[doc = ""] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + pub fn nvPTXCompilerCreate( + compiler: *mut NvptxCompilerHandle, + ptxCodeLen: size_t, + ptxCode: *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Destroys and cleans the already created PTX compiler"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to the PTX compiler which is to be \ + destroyed"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Compile a PTX program with the given compiler options"] + #[doc = ""] + #[doc = " \\param [in,out] 
compiler A handle to PTX compiler initialized with \ + the"] + #[doc = " PTX program which is to be compiled."] + #[doc = " The compiled program can be accessed using \ + the handle"] + #[doc = " \\param [in] numCompileOptions Length of the array \\p compileOptions"] + #[doc = " \\param [in] compileOptions Compiler options with which compilation \ + should be done."] + #[doc = " The compiler options string is a null \ + terminated character array."] + #[doc = " A valid list of compiler options is at"] + #[doc = " link."] + #[doc = " \\note --gpu-name (-arch) is a mandatory option."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerCompile( + compiler: NvptxCompilerHandle, + numCompileOptions: ::std::os::raw::c_int, + compileOptions: *const *const ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the size of the image of the compiled program"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] binaryImageSize The size of the image of the compiled \ + program"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + 
#[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ + \\endlink"] + #[doc = ""] + #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ + before calling this API."] + #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ + returned."] + pub fn nvPTXCompilerGetCompiledProgramSize( + compiler: NvptxCompilerHandle, + binaryImageSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Obtains the image of the compiled program"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] binaryImage The image of the compiled program."] + #[doc = " Client should allocate memory for \\p \ + binaryImage"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ + \\endlink"] + #[doc = ""] + #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ + before calling this API."] + #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ + returned."] + #[doc = ""] + pub fn nvPTXCompilerGetCompiledProgram( + compiler: NvptxCompilerHandle, + binaryImage: *mut ::std::os::raw::c_void, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the size of the error message that was seen previously for \ + the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] errorLogSize The size of the error log in bytes which \ + was 
produced"] + #[doc = " in previous call to nvPTXCompilerCompiler()."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetErrorLogSize( + compiler: NvptxCompilerHandle, + errorLogSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the error message that was seen previously for the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] errorLog The error log which was produced in \ + previous call to nvPTXCompilerCompiler()."] + #[doc = " Clients should allocate memory for \\p \ + errorLog"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetErrorLog( + compiler: NvptxCompilerHandle, + errorLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the size of the information message that was seen \ + previously for the handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] infoLogSize The size of the information log in bytes \ + which was produced"] + #[doc = " in previous call to nvPTXCompilerCompiler()."] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult 
NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetInfoLogSize( + compiler: NvptxCompilerHandle, + infoLogSize: *mut size_t, + ) -> NvptxCompileResult; + + #[doc = " \\ingroup compilation"] + #[doc = ""] + #[doc = " \\brief Query the information message that was seen previously for the \ + handle"] + #[doc = ""] + #[doc = " \\param [in] compiler A handle to PTX compiler on which \ + nvPTXCompilerCompile() has been performed."] + #[doc = " \\param [out] infoLog The information log which was produced in \ + previous call to nvPTXCompilerCompiler()."] + #[doc = " Clients should allocate memory for \\p infoLog"] + #[doc = ""] + #[doc = " \\return"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] + #[doc = ""] + pub fn nvPTXCompilerGetInfoLog( + compiler: NvptxCompilerHandle, + infoLog: *mut ::std::os::raw::c_char, + ) -> NvptxCompileResult; +} From 57e10d77272a9b38fa36c1402db3888e3bb7c940 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 May 2023 04:32:28 +0000 Subject: [PATCH 029/120] Add PTX lint parsing, no actual support yet --- examples/single-source/src/main.rs | 4 + .../src/kernel/link/ptx_compiler_sys.rs | 64 ++++---- rust-cuda-derive/src/kernel/wrapper/mod.rs | 151 +++++++++++++++++- rust-cuda-derive/src/lib.rs | 1 + 4 files changed, 182 insertions(+), 38 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 981c9bccc..997fa88bc 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -40,6 +40,10 @@ pub struct Tuple(u32, i32); #[rc::common::kernel(use link_kernel! 
as impl Kernel for Launcher)] #[kernel(crate = "rc")] +#[kernel( + allow(ptx::double_precision_use), + forbid(ptx::local_memory_usage, ptx::register_spills) +)] pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 93837a418..5e459a623 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -74,15 +74,15 @@ pub type NvptxCompileResult = ::std::os::raw::c_int; extern "C" { /// Queries the current major and minor version of PTX Compiler APIs being - /// used + /// used. /// /// # Parameters /// - [out] `major`: Major version of the PTX Compiler APIs /// - [out] `minor`: Minor version of the PTX Compiler APIs /// - /// # Return - /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_SUCCESS` - /// - [`NvptxCompileResult`]::`NVPTXCOMPILE_ERROR_INTERNAL` + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] /// /// # Note /// The version of PTX Compiler APIs follows the CUDA Toolkit versioning. 
@@ -93,42 +93,38 @@ extern "C" { minor: *mut ::std::os::raw::c_uint, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the handle to an instance of the PTX compiler"] - #[doc = " initialized with the given PTX program \\p ptxCode"] - #[doc = ""] - #[doc = " \\param [out] compiler Returns a handle to PTX compiler initialized"] - #[doc = " with the PTX program \\p ptxCode"] - #[doc = " \\param [in] ptxCodeLen Size of the PTX program \\p ptxCode passed as \ - string"] - #[doc = " \\param [in] ptxCode The PTX program which is to be compiled passed as \ - string."] - #[doc = ""] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] + /// Obtains the handle to an instance of the PTX compiler + /// initialized with the given PTX program `ptxCode`. 
+ /// + /// # Parameters + /// - [out] `compiler`: Returns a handle to PTX compiler initialized with + /// the PTX program `ptxCode` + /// - [in] `ptxCodeLen`: Size of the PTX program `ptxCode` passed as a + /// string + /// - [in] `ptxCode`: The PTX program which is to be compiled passed as a + /// string + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] pub fn nvPTXCompilerCreate( compiler: *mut NvptxCompilerHandle, ptxCodeLen: size_t, ptxCode: *const ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Destroys and cleans the already created PTX compiler"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to the PTX compiler which is to be \ - destroyed"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Destroys and cleans the already created PTX compiler. + /// + /// # Parameters + /// - [in] `compiler`: A handle to the PTX compiler which is to be + /// destroyed. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; #[doc = " \\ingroup compilation"] diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index b720a8965..a677c3e0f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -1,4 +1,8 @@ -use std::hash::{Hash, Hasher}; +use std::{ + collections::HashMap, + fmt, + hash::{Hash, Hasher}, +}; use proc_macro::TokenStream; @@ -41,6 +45,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func = parse_kernel_fn(func); let mut crate_path = None; + let mut lint_levels = HashMap::new(); func.attrs.retain(|attr| { if attr.path.is_ident("kernel") { @@ -58,7 +63,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { syn::parse_quote_spanned! { s.span() => #new_crate_path }, ); - return false; + continue; } emit_error!( @@ -73,10 +78,106 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { err ), }, + syn::NestedMeta::Meta(syn::Meta::List(syn::MetaList { + path, + nested, + .. 
+ })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { + let level = match path.get_ident() { + Some(ident) if ident == "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => unreachable!(), + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() || path.segments.empty_or_trailing() || path.segments.len() != 2 { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "register_spills" => PtxLint::RegisterSpills, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + } + }; + + match lint_levels.get(&lint) { + None => (), + 
Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, lint, previous, + ); + } + } + + lint_levels.insert(lint, level); + } + }, _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + "[rust-cuda]: Expected #[kernel(crate = \"\")] or #[kernel(allow/warn/deny/forbid())] function attribute." ); } } @@ -84,7 +185,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[kernel(crate = \"\")] function attribute." + "[rust-cuda]: Expected #[kernel(crate = \"\")] or or #[kernel(allow/warn/deny/forbid())] function attribute." ); } @@ -96,6 +197,10 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); + let _ = lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); + let _ = lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -341,6 +446,44 @@ struct FuncIdent<'f> { func_ident_hash: syn::Ident, } +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, 
Ord, Hash, Debug)] +enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUsage, + RegisterSpills, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::RegisterSpills => fmt.write_str("register_spills"), + } + } +} + fn ident_from_pat(pat: &syn::Pat) -> Option { match pat { syn::Pat::Lit(_) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index d5d8f3018..572e1c9da 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -4,6 +4,7 @@ #![feature(proc_macro_span)] #![feature(if_let_guard)] #![feature(let_chains)] +#![feature(map_try_insert)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From 1ab8b471dfc01b822b5b5d2a81cb18c78b333d13 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 May 2023 19:01:57 +0000 Subject: [PATCH 030/120] Added lint checking support to monomorphised kernel impls --- rust-cuda-derive/src/kernel/link/config.rs | 19 +- rust-cuda-derive/src/kernel/link/mod.rs | 96 ++++++++-- rust-cuda-derive/src/kernel/lints.rs | 154 ++++++++++++++++ rust-cuda-derive/src/kernel/mod.rs | 1 + .../generate/cpu_linker_macro/get_ptx_str.rs | 4 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 2 + rust-cuda-derive/src/kernel/wrapper/mod.rs | 164 ++++-------------- 7 files changed, 291 insertions(+), 149 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/lints.rs diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index bb5f011d6..e2b399dc4 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -1,4 +1,6 @@ -use std::path::PathBuf; +use std::{collections::HashMap, path::PathBuf}; + +use 
super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { @@ -8,6 +10,7 @@ pub(super) struct LinkKernelConfig { pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, + pub(super) ptx_lint_levels: HashMap, } impl syn::parse::Parse for LinkKernelConfig { @@ -37,6 +40,19 @@ impl syn::parse::Parse for LinkKernelConfig { String::new() }; + let attrs = syn::punctuated::Punctuated::< + syn::MetaList, + syn::token::Comma, + >::parse_separated_nonempty(input)?; + + let mut ptx_lint_levels = HashMap::new(); + + for syn::MetaList { path, nested, .. } in attrs { + parse_ptx_lint_level(&path, &nested, &mut ptx_lint_levels); + } + + proc_macro_error::abort_if_dirty(); + Ok(Self { kernel, kernel_hash, @@ -44,6 +60,7 @@ impl syn::parse::Parse for LinkKernelConfig { crate_name: name.value(), crate_path: PathBuf::from(path.value()), specialisation, + ptx_lint_levels, }) } } diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index f6f4719c4..a79505c13 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,4 +1,5 @@ use std::{ + collections::HashMap, env, ffi::CString, fmt::Write as FmtWrite, @@ -17,7 +18,10 @@ use ptx_builder::{ error::{BuildErrorKind, Error, Result}, }; -use super::utils::skip_kernel_compilation; +use super::{ + lints::{LintLevel, PtxLint}, + utils::skip_kernel_compilation, +}; mod config; mod error; @@ -68,12 +72,14 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { crate_name, crate_path, specialisation, + ptx_lint_levels, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION) expects KERNEL and ARGS \ - identifiers, NAME and PATH string literals, and SPECIALISATION tokens: {:?}", + "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION 
LINTS,*) expects KERNEL and \ + ARGS identifiers, NAME and PATH string literals, SPECIALISATION and LINTS \ + tokens: {:?}", err ) }, @@ -208,7 +214,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } let (result, error_log, info_log, version, drop) = - check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash); + check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash, &ptx_lint_levels); let ptx_compiler = match &version { Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), @@ -279,10 +285,12 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } #[allow(clippy::type_complexity)] +#[allow(clippy::too_many_lines)] fn check_kernel_ptx( kernel_ptx: &str, specialisation: &str, kernel_hash: &proc_macro2::Ident, + ptx_lint_levels: &HashMap, ) -> ( Result<(), NvptxError>, Result, NvptxError>, @@ -304,7 +312,7 @@ fn check_kernel_ptx( compiler }; - let result = { + let result = (|| { let kernel_name = if specialisation.is_empty() { format!("{kernel_hash}_kernel") } else { @@ -313,15 +321,79 @@ fn check_kernel_ptx( seahash::hash(specialisation.as_bytes()) ) }; - - let options = vec![ + let mut options = vec![ CString::new("--entry").unwrap(), CString::new(kernel_name).unwrap(), - CString::new("--verbose").unwrap(), - CString::new("--warn-on-double-precision-use").unwrap(), - CString::new("--warn-on-local-memory-usage").unwrap(), - CString::new("--warn-on-spills").unwrap(), ]; + + if ptx_lint_levels + .values() + .any(|level| *level > LintLevel::Warn) + { + let mut options = options.clone(); + + if ptx_lint_levels + .get(&PtxLint::Verbose) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--verbose").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::DoublePrecisionUse) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--warn-on-double-precision-use").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUsage) + .map_or(false, |level| *level > 
LintLevel::Warn) + { + options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Warn) + { + options.push(CString::new("--warn-on-spills").unwrap()); + } + options.push(CString::new("--warning-as-error").unwrap()); + + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerCompile( + compiler, + options_ptrs.len() as c_int, + options_ptrs.as_ptr().cast(), + ) + })?; + }; + + if ptx_lint_levels + .get(&PtxLint::Verbose) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--verbose").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::DoublePrecisionUse) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-double-precision-use").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::LocalMemoryUsage) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + } + if ptx_lint_levels + .get(&PtxLint::RegisterSpills) + .map_or(false, |level| *level > LintLevel::Allow) + { + options.push(CString::new("--warn-on-spills").unwrap()); + } + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); NvptxError::try_err_from(unsafe { @@ -331,7 +403,7 @@ fn check_kernel_ptx( options_ptrs.as_ptr().cast(), ) }) - }; + })(); let error_log = (|| { let mut error_log_size = 0; diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs new file mode 100644 index 000000000..6cdb63ca8 --- /dev/null +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -0,0 +1,154 @@ +use std::{collections::HashMap, fmt}; + +use syn::spanned::Spanned; + +pub fn parse_ptx_lint_level( + path: &syn::Path, + nested: &syn::punctuated::Punctuated, + ptx_lint_levels: &mut HashMap, +) { + let level = match path.get_ident() { + Some(ident) if ident 
== "allow" => LintLevel::Allow, + Some(ident) if ident == "warn" => LintLevel::Warn, + Some(ident) if ident == "deny" => LintLevel::Deny, + Some(ident) if ident == "forbid" => LintLevel::Forbid, + _ => { + emit_error!( + path.span(), + "[rust-cuda]: Invalid lint #[kernel(())] attribute: unknown lint \ + level, must be one of `allow`, `warn`, `deny`, `forbid`.", + ); + + return; + }, + }; + + for meta in nested { + let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute.", + level, + ); + continue; + }; + + if path.leading_colon.is_some() + || path.segments.empty_or_trailing() + || path.segments.len() != 2 + { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + if namespace != "ptx" { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", + level, + ); + continue; + } + + let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + emit_error!( + meta.span(), + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + level, + ); + continue; + }; + + let lint = match lint { + l if l == "verbose" => PtxLint::Verbose, + l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, + l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "register_spills" => PtxLint::RegisterSpills, + _ => { + emit_error!( + meta.span(), + "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", + lint, + ); + continue; + }, + }; + + match ptx_lint_levels.get(&lint) { + None => (), 
+ Some(LintLevel::Forbid) if level < LintLevel::Forbid => { + emit_error!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", + level, + lint, + ); + continue; + }, + Some(previous) => { + emit_warning!( + meta.span(), + "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", + level, + lint, + previous, + ); + }, + } + + ptx_lint_levels.insert(lint, level); + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum LintLevel { + Allow, + Warn, + Deny, + Forbid, +} + +impl fmt::Display for LintLevel { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Allow => fmt.write_str("allow"), + Self::Warn => fmt.write_str("warn"), + Self::Deny => fmt.write_str("deny"), + Self::Forbid => fmt.write_str("forbid"), + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub enum PtxLint { + Verbose, + DoublePrecisionUse, + LocalMemoryUsage, + RegisterSpills, +} + +impl fmt::Display for PtxLint { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Verbose => fmt.write_str("verbose"), + Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), + Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::RegisterSpills => fmt.write_str("register_spills"), + } + } +} diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs index c44f1dd2f..6dff13380 100644 --- a/rust-cuda-derive/src/kernel/mod.rs +++ b/rust-cuda-derive/src/kernel/mod.rs @@ -2,4 +2,5 @@ pub mod link; pub mod specialise; pub mod wrapper; +mod lints; mod utils; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index b3e215a20..d62445803 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -5,6 
+5,7 @@ use crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +#[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx_str( crate_path: &syn::Path, FuncIdent { @@ -21,6 +22,7 @@ pub(super) fn quote_get_ptx_str( inputs: &FunctionInputs, func_params: &[syn::Ident], macro_type_ids: &[syn::Ident], + ptx_lint_levels: &TokenStream, ) -> TokenStream { let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { Ok(crate_name) => crate_name.to_uppercase(), @@ -80,7 +82,7 @@ pub(super) fn quote_get_ptx_str( #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* - #generic_close_token + #generic_close_token #ptx_lint_levels } #matching_kernel_assert diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 91f94a568..0ca963bb2 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -26,6 +26,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs: &FunctionInputs, func_ident: &FuncIdent, func_params: &[syn::Ident], + ptx_lint_levels: &TokenStream, ) -> TokenStream { let macro_types = generic_params .iter() @@ -59,6 +60,7 @@ pub(in super::super) fn quote_cpu_linker_macro( func_inputs, func_params, ¯o_type_ids, + ptx_lint_levels, ); let new_kernel = quote_new_kernel( crate_path, diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a677c3e0f..ee7cfa404 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -1,6 +1,5 @@ use std::{ collections::HashMap, - fmt, hash::{Hash, Hasher}, }; @@ -11,6 +10,8 @@ mod generate; mod inputs; mod parse; +use 
super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; + use config::KernelConfig; use generate::{ args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, @@ -19,7 +20,7 @@ use generate::{ }; use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; -use proc_macro2::Span; +use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; #[allow(clippy::too_many_lines)] @@ -45,7 +46,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func = parse_kernel_fn(func); let mut crate_path = None; - let mut lint_levels = HashMap::new(); + let mut ptx_lint_levels = HashMap::new(); func.attrs.retain(|attr| { if attr.path.is_ident("kernel") { @@ -83,96 +84,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { nested, .. })) if path.is_ident("allow") || path.is_ident("warn") || path.is_ident("deny") || path.is_ident("forbid") => { - let level = match path.get_ident() { - Some(ident) if ident == "allow" => LintLevel::Allow, - Some(ident) if ident == "warn" => LintLevel::Warn, - Some(ident) if ident == "deny" => LintLevel::Deny, - Some(ident) if ident == "forbid" => LintLevel::Forbid, - _ => unreachable!(), - }; - - for meta in nested { - let syn::NestedMeta::Meta(syn::Meta::Path(path)) = meta else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute.", - level, - ); - continue; - }; - - if path.leading_colon.is_some() || path.segments.empty_or_trailing() || path.segments.len() != 2 { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - } - - let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - }; - - if namespace != "ptx" { - emit_error!( - meta.span(), - "[rust-cuda]: 
Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - } - - let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { - emit_error!( - meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", - level, - ); - continue; - }; - - let lint = match lint { - l if l == "verbose" => PtxLint::Verbose, - l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, - l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, - l if l == "register_spills" => PtxLint::RegisterSpills, - _ => { - emit_error!( - meta.span(), - "[rust-cuda]: Unknown PTX kernel lint `ptx::{}`.", - lint, - ); - continue; - } - }; - - match lint_levels.get(&lint) { - None => (), - Some(LintLevel::Forbid) if level < LintLevel::Forbid => { - emit_error!( - meta.span(), - "[rust-cuda]: {}(ptx::{}) incompatible with previous forbid.", - level, lint, - ); - continue; - }, - Some(previous) => { - emit_warning!( - meta.span(), - "[rust-cuda]: {}(ptx::{}) overwrites previous {}.", - level, lint, previous, - ); - } - } - - lint_levels.insert(lint, level); - } + parse_ptx_lint_level(path, nested, &mut ptx_lint_levels); }, _ => { emit_error!( @@ -197,9 +109,26 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let crate_path = crate_path.unwrap_or_else(|| syn::parse_quote!(::rust_cuda)); - let _ = lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); - let _ = lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); - let _ = lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + + let 
ptx_lint_levels = { + let (lints, levels): (Vec, Vec) = ptx_lint_levels + .into_iter() + .map(|(lint, level)| { + ( + Ident::new(&lint.to_string(), Span::call_site()), + Ident::new(&level.to_string(), Span::call_site()), + ) + }) + .unzip(); + + quote! { + #(#levels(ptx::#lints)),* + } + }; let mut generic_kernel_params = func.sig.generics.params.clone(); let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); @@ -376,6 +305,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_inputs, &func_ident, &func_params, + &ptx_lint_levels, ); let cuda_wrapper = quote_cuda_wrapper( &crate_path, @@ -446,44 +376,6 @@ struct FuncIdent<'f> { func_ident_hash: syn::Ident, } -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -enum LintLevel { - Allow, - Warn, - Deny, - Forbid, -} - -impl fmt::Display for LintLevel { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Allow => fmt.write_str("allow"), - Self::Warn => fmt.write_str("warn"), - Self::Deny => fmt.write_str("deny"), - Self::Forbid => fmt.write_str("forbid"), - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -enum PtxLint { - Verbose, - DoublePrecisionUse, - LocalMemoryUsage, - RegisterSpills, -} - -impl fmt::Display for PtxLint { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Verbose => fmt.write_str("verbose"), - Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), - Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), - Self::RegisterSpills => fmt.write_str("register_spills"), - } - } -} - fn ident_from_pat(pat: &syn::Pat) -> Option { match pat { syn::Pat::Lit(_) @@ -547,7 +439,9 @@ fn quote_generic_check( #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }> = #crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( - #crate_path::host::check_kernel!(#args #crate_name 
#crate_manifest_dir).as_bytes(), + #crate_path::host::check_kernel!( + #args #crate_name #crate_manifest_dir + ).as_bytes(), concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() ) }>; From d8a732f85d1c168d6c4f1726155624009115aac0 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 07:23:25 +0000 Subject: [PATCH 031/120] Improve kernel checking + added cubin dump lint --- rust-cuda-derive/src/kernel/link/config.rs | 3 + rust-cuda-derive/src/kernel/link/mod.rs | 195 +++++++++++--- .../src/kernel/link/ptx_compiler_sys.rs | 246 ++++++++---------- rust-cuda-derive/src/kernel/lints.rs | 3 + .../src/kernel/specialise/entry.rs | 2 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 +- 6 files changed, 278 insertions(+), 185 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index e2b399dc4..efb7899fa 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -67,6 +67,7 @@ impl syn::parse::Parse for LinkKernelConfig { #[allow(clippy::module_name_repetitions)] pub(super) struct CheckKernelConfig { + pub(super) kernel_hash: syn::Ident, pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, @@ -74,11 +75,13 @@ pub(super) struct CheckKernelConfig { impl syn::parse::Parse for CheckKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { + let kernel_hash: syn::Ident = input.parse()?; let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; Ok(Self { + kernel_hash, args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index a79505c13..5d5fef5b4 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -32,11 +32,10 @@ use error::emit_ptx_build_error; use 
ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! { - "ERROR in this PTX compilation" - }); + proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); let CheckKernelConfig { + kernel_hash, args, crate_name, crate_path, @@ -44,8 +43,8 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "check_kernel!(ARGS NAME PATH) expects ARGS identifier, NAME and PATH string \ - literals: {:?}", + "check_kernel!(HASH ARGS NAME PATH) expects HASH and ARGS identifiers, annd NAME \ + and PATH string literals: {:?}", err ) }, @@ -53,10 +52,18 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); - match kernel_ptx { - Some(kernel_ptx) => quote!(#kernel_ptx).into(), - None => quote!("ERROR in this PTX compilation").into(), - } + let Some(kernel_ptx) = kernel_ptx else { + return quote!(::core::result::Result::Err(())).into() + }; + + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Check, + &kernel_hash, + &HashMap::new(), + ); + + quote!(::core::result::Result::Ok(())).into() } #[allow(clippy::module_name_repetitions, clippy::too_many_lines)] @@ -77,9 +84,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ - ARGS identifiers, NAME and PATH string literals, SPECIALISATION and LINTS \ - tokens: {:?}", + "link_kernel!(KERNEL HASH ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL, \ + HASH, and ARGS identifiers, NAME and PATH string literals, and SPECIALISATION \ + and LINTS tokens: {:?}", err ) }, @@ -213,32 +220,44 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - let (result, error_log, info_log, 
version, drop) = - check_kernel_ptx(&kernel_ptx, &specialisation, &kernel_hash, &ptx_lint_levels); + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Link(&specialisation), + &kernel_hash, + &ptx_lint_levels, + ); + + (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +} + +#[allow(clippy::too_many_lines)] +fn check_kernel_ptx_and_report( + kernel_ptx: &str, + specialisation: Specialisation, + kernel_hash: &proc_macro2::Ident, + ptx_lint_levels: &HashMap, +) { + let (result, error_log, info_log, binary, version, drop) = + check_kernel_ptx(kernel_ptx, specialisation, kernel_hash, ptx_lint_levels); let ptx_compiler = match &version { Ok((major, minor)) => format!("PTX compiler v{major}.{minor}"), Err(_) => String::from("PTX compiler"), }; - // TODO: allow user to select - // - warn on double - // - warn on float - // - warn on spills - // - verbose warn - // - warnings as errors - // - show PTX source if warning or error - let mut errors = String::new(); + if let Err(err) = drop { let _ = errors.write_fmt(format_args!("Error dropping the {ptx_compiler}: {err}\n")); } + if let Err(err) = version { let _ = errors.write_fmt(format_args!( "Error fetching the version of the {ptx_compiler}: {err}\n" )); } - if let (Ok(Some(_)), _) | (_, Ok(Some(_))) = (&info_log, &error_log) { + + let ptx_source_code = { let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; let mut indent = 0; while max_lines > 0 { @@ -246,7 +265,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { indent += 1; } - emit_call_site_warning!( + format!( "PTX source code:\n{}", kernel_ptx .lines() @@ -254,47 +273,109 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .map(|(i, l)| format!("{:indent$}| {l}", i + 1)) .collect::>() .join("\n") - ); + ) + }; + + match binary { + Ok(None) => (), + Ok(Some(binary)) => { + if ptx_lint_levels + .get(&PtxLint::DumpBinary) + .map_or(false, |level| *level > LintLevel::Allow) + { + const 
HEX: [char; 16] = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + ]; + + let mut binary_hex = String::with_capacity(binary.len() * 2); + for byte in binary { + binary_hex.push(HEX[usize::from(byte >> 4)]); + binary_hex.push(HEX[usize::from(byte & 0x0F)]); + } + + if ptx_lint_levels + .get(&PtxLint::DumpBinary) + .map_or(false, |level| *level > LintLevel::Warn) + { + emit_call_site_error!( + "{} compiled binary:\n{}\n\n{}", + ptx_compiler, + binary_hex, + ptx_source_code + ); + } else { + emit_call_site_warning!( + "{} compiled binary:\n{}\n\n{}", + ptx_compiler, + binary_hex, + ptx_source_code + ); + } + } + }, + Err(err) => { + let _ = errors.write_fmt(format_args!( + "Error fetching the compiled binary from {ptx_compiler}: {err}\n" + )); + }, } + match info_log { Ok(None) => (), - Ok(Some(info_log)) => emit_call_site_warning!("{ptx_compiler} info log:\n{}", info_log), + Ok(Some(info_log)) => emit_call_site_warning!( + "{} info log:\n{}\n{}", + ptx_compiler, + info_log, + ptx_source_code + ), Err(err) => { let _ = errors.write_fmt(format_args!( "Error fetching the info log of the {ptx_compiler}: {err}\n" )); }, }; - match error_log { - Ok(None) => (), - Ok(Some(error_log)) => emit_call_site_error!("{ptx_compiler} error log:\n{}", error_log), + + let error_log = match error_log { + Ok(None) => String::new(), + Ok(Some(error_log)) => { + format!("{ptx_compiler} error log:\n{error_log}\n{ptx_source_code}") + }, Err(err) => { let _ = errors.write_fmt(format_args!( "Error fetching the error log of the {ptx_compiler}: {err}\n" )); + String::new() }, }; + if let Err(err) = result { let _ = errors.write_fmt(format_args!("Error compiling the PTX source code: {err}\n")); } - if !errors.is_empty() { - abort_call_site!("{}", errors); - } - (quote! 
{ const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() + if !error_log.is_empty() || !errors.is_empty() { + abort_call_site!( + "{error_log}{}{errors}", + if !error_log.is_empty() && !errors.is_empty() { + "\n\n" + } else { + "" + } + ); + } } #[allow(clippy::type_complexity)] #[allow(clippy::too_many_lines)] fn check_kernel_ptx( kernel_ptx: &str, - specialisation: &str, + specialisation: Specialisation, kernel_hash: &proc_macro2::Ident, ptx_lint_levels: &HashMap, ) -> ( Result<(), NvptxError>, Result, NvptxError>, Result, NvptxError>, + Result>, NvptxError>, Result<(u32, u32), NvptxError>, Result<(), NvptxError>, ) { @@ -313,14 +394,15 @@ fn check_kernel_ptx( }; let result = (|| { - let kernel_name = if specialisation.is_empty() { - format!("{kernel_hash}_kernel") - } else { - format!( + let kernel_name = match specialisation { + Specialisation::Check => format!("{kernel_hash}_chECK"), + Specialisation::Link("") => format!("{kernel_hash}_kernel"), + Specialisation::Link(specialisation) => format!( "{kernel_hash}_kernel_{:016x}", seahash::hash(specialisation.as_bytes()) - ) + ), }; + let mut options = vec![ CString::new("--entry").unwrap(), CString::new(kernel_name).unwrap(), @@ -457,6 +539,39 @@ fn check_kernel_ptx( Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) })(); + let binary = (|| { + if result.is_err() { + return Ok(None); + } + + let mut binary_size = 0; + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( + compiler, + addr_of_mut!(binary_size), + ) + })?; + + if binary_size == 0 { + return Ok(None); + } + + #[allow(clippy::cast_possible_truncation)] + let mut binary: Vec = Vec::with_capacity(binary_size as usize); + + NvptxError::try_err_from(unsafe { + ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) + })?; + + #[allow(clippy::cast_possible_truncation)] + unsafe { + binary.set_len(binary_size as usize); + } + + Ok(Some(binary)) + })(); + let 
version = (|| { let mut major = 0; let mut minor = 0; @@ -475,7 +590,7 @@ fn check_kernel_ptx( }) }; - (result, error_log, info_log, version, drop) + (result, error_log, info_log, binary, version, drop) } fn compile_kernel( diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 5e459a623..0ab332dad 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -74,7 +74,7 @@ pub type NvptxCompileResult = ::std::os::raw::c_int; extern "C" { /// Queries the current major and minor version of PTX Compiler APIs being - /// used. + /// used. /// /// # Parameters /// - [out] `major`: Major version of the PTX Compiler APIs @@ -94,7 +94,7 @@ extern "C" { ) -> NvptxCompileResult; /// Obtains the handle to an instance of the PTX compiler - /// initialized with the given PTX program `ptxCode`. + /// initialized with the given PTX program `ptxCode`. /// /// # Parameters /// - [out] `compiler`: Returns a handle to PTX compiler initialized with @@ -127,169 +127,147 @@ extern "C" { /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerDestroy(compiler: *mut NvptxCompilerHandle) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Compile a PTX program with the given compiler options"] - #[doc = ""] - #[doc = " \\param [in,out] compiler A handle to PTX compiler initialized with \ - the"] - #[doc = " PTX program which is to be compiled."] - #[doc = " The compiled program can be accessed using \ - the handle"] - #[doc = " \\param [in] numCompileOptions Length of the array \\p compileOptions"] - #[doc = " \\param [in] compileOptions Compiler options with which compilation \ - should be done."] - #[doc = " The compiler options string is a null \ - terminated character array."] - #[doc = " A valid list of compiler options is at"] - #[doc = " link."] - #[doc = " \\note 
--gpu-name (-arch) is a mandatory option."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION \\endlink"] - #[doc = ""] + /// Compile a PTX program with the given compiler options. + /// + /// # Parameters + /// - [in, out] `compiler`: A handle to PTX compiler initialized with the + /// PTX program which is to be compiled. The compiled program can be + /// accessed using the handle. + /// - [in] `numCompileOptions`: Length of the array `compileOptions` + /// - [in] `compileOptions`: Compiler options with which compilation should + /// be done. The compiler options string is a null terminated character + /// array. A valid list of compiler options is available at + /// [link](http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options). + /// + /// # Note + /// `--gpu-name` (`-arch`) is a mandatory option. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_OUT_OF_MEMORY`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILATION_FAILURE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION`] pub fn nvPTXCompilerCompile( compiler: NvptxCompilerHandle, numCompileOptions: ::std::os::raw::c_int, compileOptions: *const *const ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the size of the image of the compiled program"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] binaryImageSize The size of the image of the compiled \ - program"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ - \\endlink"] - #[doc = ""] - #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ - before calling this API."] - #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ - returned."] + /// Obtains the size of the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. 
+ /// - [out] `binaryImageSize`: The size of the image of the compiled program + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. pub fn nvPTXCompilerGetCompiledProgramSize( compiler: NvptxCompilerHandle, binaryImageSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Obtains the image of the compiled program"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] binaryImage The image of the compiled program."] - #[doc = " Client should allocate memory for \\p \ - binaryImage"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \ - \\endlink"] - #[doc = ""] - #[doc = " \\note nvPTXCompilerCompile() API should be invoked for the handle \ - before calling this API."] - #[doc = " Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is \ - returned."] - #[doc = ""] + /// Obtains the image of the compiled program. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `binaryImage`: The image of the compiled program. 
The caller + /// should allocate memory for `binaryImage`. + /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// + /// # Note + /// The [`nvPTXCompilerCompile`] function should be invoked for the handle + /// before calling this API. Otherwise, + /// [`NvptxCompileResult::NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE`] + /// is returned. pub fn nvPTXCompilerGetCompiledProgram( compiler: NvptxCompilerHandle, binaryImage: *mut ::std::os::raw::c_void, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the size of the error message that was seen previously for \ - the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] errorLogSize The size of the error log in bytes which \ - was produced"] - #[doc = " in previous call to nvPTXCompilerCompiler()."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the size of the error message that was seen previously for the + /// handle. + /// + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLogSize`: The size of the error log in bytes which was + /// produced in previous call to [`nvPTXCompilerCompile`]. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetErrorLogSize( compiler: NvptxCompilerHandle, errorLogSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the error message that was seen previously for the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] errorLog The error log which was produced in \ - previous call to nvPTXCompilerCompiler()."] - #[doc = " Clients should allocate memory for \\p \ - errorLog"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the error message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `errorLog`: The error log which was produced in previous call to + /// [`nvPTXCompilerCompile`]. The caller should allocate memory for + /// `errorLog`. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetErrorLog( compiler: NvptxCompilerHandle, errorLog: *mut ::std::os::raw::c_char, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the size of the information message that was seen \ - previously for the handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] infoLogSize The size of the information log in bytes \ - which was produced"] - #[doc = " in previous call to nvPTXCompilerCompiler()."] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the size of the information message that was seen previously for + /// the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLogSize`: The size of the information log in bytes which + /// was produced in previous call to [`nvPTXCompilerCompile`]. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetInfoLogSize( compiler: NvptxCompilerHandle, infoLogSize: *mut size_t, ) -> NvptxCompileResult; - #[doc = " \\ingroup compilation"] - #[doc = ""] - #[doc = " \\brief Query the information message that was seen previously for the \ - handle"] - #[doc = ""] - #[doc = " \\param [in] compiler A handle to PTX compiler on which \ - nvPTXCompilerCompile() has been performed."] - #[doc = " \\param [out] infoLog The information log which was produced in \ - previous call to nvPTXCompilerCompiler()."] - #[doc = " Clients should allocate memory for \\p infoLog"] - #[doc = ""] - #[doc = " \\return"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \\endlink"] - #[doc = " - \\link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE \\endlink"] - #[doc = ""] + /// Query the information message that was seen previously for the handle. + /// + /// # Parameters + /// - [in] `compiler`: A handle to PTX compiler on which + /// [`nvPTXCompilerCompile`] has been performed. + /// - [out] `infoLog`: The information log which was produced in previous + /// call to [`nvPTXCompilerCompile`]. The caller should allocate memory + /// for `infoLog`. 
+ /// + /// # Returns + /// - [`NvptxCompileResult::NVPTXCOMPILE_SUCCESS`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INTERNAL`] + /// - [`NvptxCompileResult::NVPTXCOMPILE_ERROR_INVALID_PROGRAM_HANDLE`] pub fn nvPTXCompilerGetInfoLog( compiler: NvptxCompilerHandle, infoLog: *mut ::std::os::raw::c_char, diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index 6cdb63ca8..e91222dcd 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -79,6 +79,7 @@ pub fn parse_ptx_lint_level( l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, l if l == "register_spills" => PtxLint::RegisterSpills, + l if l == "dump_binary" => PtxLint::DumpBinary, _ => { emit_error!( meta.span(), @@ -140,6 +141,7 @@ pub enum PtxLint { DoublePrecisionUse, LocalMemoryUsage, RegisterSpills, + DumpBinary, } impl fmt::Display for PtxLint { @@ -149,6 +151,7 @@ impl fmt::Display for PtxLint { Self::DoublePrecisionUse => fmt.write_str("double_precision_use"), Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), Self::RegisterSpills => fmt.write_str("register_spills"), + Self::DumpBinary => fmt.write_str("dump_binary"), } } } diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-derive/src/kernel/specialise/entry.rs index e8bce23b9..b85a433e7 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-derive/src/kernel/specialise/entry.rs @@ -33,7 +33,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr func.sig.ident = match proc_macro::tracked_env::var(&specialisation_var).as_deref() { Ok("") => quote::format_ident!("{}_kernel", func.sig.ident), Ok("chECK") => { - let func_ident = func.sig.ident; + let func_ident = quote::format_ident!("{}_chECK", func.sig.ident); return (quote! 
{ #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index ee7cfa404..a70c38e94 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -113,6 +113,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); let ptx_lint_levels = { let (lints, levels): (Vec, Vec) = ptx_lint_levels @@ -435,15 +436,8 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: #crate_path::safety::kernel_signature::Assert<{ - #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = #crate_path::safety::kernel_signature::Assert::<{ - #crate_path::safety::kernel_signature::check( - #crate_path::host::check_kernel!( - #args #crate_name #crate_manifest_dir - ).as_bytes(), - concat!(".visible .entry ", stringify!(#func_ident_hash)).as_bytes() - ) - }>; + const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( + #func_ident_hash #args #crate_name #crate_manifest_dir + ); } } From 8f4e7a17e46df26138026dd767fd1553950c6099 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 08:03:58 +0000 Subject: [PATCH 032/120] Fix kernel macro config parsing --- rust-cuda-derive/src/kernel/wrapper/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index 382db35f9..d8951230d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -20,7 +20,7 @@ impl syn::parse::Parse 
for KernelConfig { let args: syn::Ident = input.parse()?; let _comma: syn::token::Comma = input.parse()?; let ptx: syn::Ident = input.parse()?; - let _comma: Option = input.parse()?; + let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; let launcher: syn::Ident = input.parse()?; From e9df07ddb3921e3ef8031c90aaa7509f8e77252b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 08:58:04 +0000 Subject: [PATCH 033/120] Explicitly fitting Device[Const|Mut]Ref into device registers --- rust-cuda-derive/src/kernel/link/mod.rs | 2 +- src/safety/register_fit.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 5d5fef5b4..cdc727a22 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -445,7 +445,7 @@ fn check_kernel_ptx( NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, - options_ptrs.len() as c_int, + c_int::try_from(options_ptrs.len()).unwrap(), options_ptrs.as_ptr().cast(), ) })?; diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs index 1ddf33849..ef1c8ce98 100644 --- a/src/safety/register_fit.rs +++ b/src/safety/register_fit.rs @@ -2,12 +2,24 @@ pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} impl FitsIntoDeviceRegister for T {} mod private { + #[marker] pub trait FitsIntoDeviceRegister {} impl FitsIntoDeviceRegister for T where AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits { } + // Since T: Sized, the pointers are thin, and must thus fit into device + // registers + impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister + for crate::common::DeviceConstRef<'r, T> + { + } + impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister + for crate::common::DeviceMutRef<'r, T> + { + } + #[derive(PartialEq, Eq, 
core::marker::ConstParamTy)] pub enum TypeSize { TypeFitsInto64Bits, From cff4eab1c4dbe6a1248ed707482b94b22fa2fea9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 09:25:48 +0000 Subject: [PATCH 034/120] Switched one std:: to core:: --- .../wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 462855156..9a22a46e8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -119,7 +119,7 @@ pub(super) fn quote_kernel_func_async( unsafe { stream.launch(function, grid, block, shared_memory_size, &[ #( - &#func_params as *const _ as *mut ::std::ffi::c_void + &#func_params as *const _ as *mut ::core::ffi::c_void ),* ] ) } From fb9461abfd8b03df3046ddb34e280e00bccf515e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 May 2023 12:25:17 +0000 Subject: [PATCH 035/120] Remove register-sized CUDA kernel args check, unnecessary since https://github.com/rust-lang/rust/pull/94703 --- examples/single-source/src/main.rs | 8 ++- .../cpu_wrapper/kernel_func_async/mod.rs | 6 -- .../kernel/wrapper/generate/cuda_wrapper.rs | 6 -- src/safety/mod.rs | 2 - src/safety/register_fit.rs | 55 ------------------- 5 files changed, 7 insertions(+), 70 deletions(-) delete mode 100644 src/safety/register_fit.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 997fa88bc..ccd384676 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -38,6 +38,11 @@ pub struct Empty([u8; 0]); #[layout(crate = "rc::const_type_layout")] pub struct Tuple(u32, i32); +#[repr(C)] +#[derive(rc::const_type_layout::TypeLayout)] +#[layout(crate = 
"rc::const_type_layout")] +pub struct Triple(i32, i32, i32); + #[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] #[kernel(crate = "rc")] #[kernel( @@ -51,6 +56,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, + #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where T: rc::safety::StackOnly + rc::safety::NoAliasing, @@ -65,7 +71,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( (*shared.index_mut_unchecked(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { - (*shared2.index_mut_unchecked(2)).1 = 24; + (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } // unsafe { core::arch::asm!("hi") } // unsafe { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 9a22a46e8..6e123440e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -102,14 +102,8 @@ pub(super) fn quote_kernel_func_async( #[allow(dead_code)] fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: #crate_path::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - #(assert_impl_devicecopy(&#func_params);)* #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* } let #crate_path::host::LaunchConfig { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 36e316708..29473858e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ 
b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -136,14 +136,8 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(dead_code)] fn assert_impl_no_aliasing() {} - #[allow(dead_code)] - fn assert_impl_fits_into_device_register< - T: #crate_path::safety::FitsIntoDeviceRegister, - >(_val: &T) {} - #(assert_impl_devicecopy(&#func_params);)* #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* - #(assert_impl_fits_into_device_register(&#func_params);)* } #ptx_func_input_unwrap diff --git a/src/safety/mod.rs b/src/safety/mod.rs index cf7a8f718..7e5c419a5 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,7 +1,6 @@ mod arch; mod device_copy; mod no_aliasing; -mod register_fit; mod stack_only; #[cfg(any(feature = "alloc", doc))] mod unified_heap; @@ -13,7 +12,6 @@ pub mod type_layout; pub use device_copy::SafeDeviceCopy; pub use no_aliasing::NoAliasing; -pub use register_fit::FitsIntoDeviceRegister; pub use stack_only::StackOnly; #[cfg(any(feature = "alloc", doc))] pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/register_fit.rs b/src/safety/register_fit.rs deleted file mode 100644 index ef1c8ce98..000000000 --- a/src/safety/register_fit.rs +++ /dev/null @@ -1,55 +0,0 @@ -pub trait FitsIntoDeviceRegister: private::FitsIntoDeviceRegister {} -impl FitsIntoDeviceRegister for T {} - -mod private { - #[marker] - pub trait FitsIntoDeviceRegister {} - impl FitsIntoDeviceRegister for T where - AssertTypeFitsInto64Bits<{ TypeSize::check::() }>: FitsInto64Bits - { - } - - // Since T: Sized, the pointers are thin, and must thus fit into device - // registers - impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister - for crate::common::DeviceConstRef<'r, T> - { - } - impl<'r, T: rustacuda_core::DeviceCopy + 'r> FitsIntoDeviceRegister - for crate::common::DeviceMutRef<'r, T> - { - } - - #[derive(PartialEq, Eq, core::marker::ConstParamTy)] - pub enum TypeSize { - TypeFitsInto64Bits, - // FIXME: ConstParamTy variant with str 
ICEs in rustdoc - #[cfg(not(doc))] - TypeExeceeds64Bits(&'static str), - #[cfg(doc)] - TypeExeceeds64Bits, - } - - impl TypeSize { - pub const fn check() -> Self { - if core::mem::size_of::() <= core::mem::size_of::() { - Self::TypeFitsInto64Bits - } else { - #[cfg(not(doc))] - { - Self::TypeExeceeds64Bits(core::any::type_name::()) - } - #[cfg(doc)] - { - Self::TypeExeceeds64Bits - } - } - } - } - - pub enum AssertTypeFitsInto64Bits {} - - pub trait FitsInto64Bits {} - - impl FitsInto64Bits for AssertTypeFitsInto64Bits<{ TypeSize::TypeFitsInto64Bits }> {} -} From e33a270b8b2e34101336d3558f3f62c9a7a2c227 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 14 May 2023 09:30:51 +0000 Subject: [PATCH 036/120] Simplified the kernel parameter layout extraction from PTX --- rust-cuda-derive/src/kernel/link/mod.rs | 185 ++++++++---------- rust-cuda-derive/src/kernel/mod.rs | 3 + .../kernel/wrapper/generate/cuda_wrapper.rs | 23 +-- rust-cuda-ptx-jit/src/device.rs | 2 +- 4 files changed, 102 insertions(+), 111 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index cdc727a22..8df29f33a 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -21,6 +21,7 @@ use ptx_builder::{ use super::{ lints::{LintLevel, PtxLint}, utils::skip_kernel_compilation, + KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, }; mod config; @@ -66,14 +67,14 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { quote!(::core::result::Result::Ok(())).into() } -#[allow(clippy::module_name_repetitions, clippy::too_many_lines)] +#[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! 
{ const PTX_STR: &'static str = "ERROR in this PTX compilation"; }); let LinkKernelConfig { - kernel, + kernel: _kernel, kernel_hash, args, crate_name, @@ -111,123 +112,109 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .into(); }; - let kernel_layout_name = if specialisation.is_empty() { - format!("{kernel}_type_layout_kernel") - } else { - format!( - "{kernel}_type_layout_kernel_{:016x}", - seahash::hash(specialisation.as_bytes()) - ) - }; + let type_layouts = extract_ptx_kernel_layout(&mut kernel_ptx); + remove_kernel_type_use_from_ptx(&mut kernel_ptx); - let mut type_layouts = Vec::new(); + check_kernel_ptx_and_report( + &kernel_ptx, + Specialisation::Link(&specialisation), + &kernel_hash, + &ptx_lint_levels, + ); - let type_layout_start_pattern = format!("\n\t// .globl\t{kernel_layout_name}"); + (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +} - if let Some(type_layout_start) = kernel_ptx.find(&type_layout_start_pattern) { - const BEFORE_PARAM_PATTERN: &str = ".global .align 1 .b8 "; - const PARAM_LEN_PATTERN: &str = "["; - const LEN_BYTES_PATTERN: &str = "] = {"; - const AFTER_BYTES_PATTERN: &str = "};"; +fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { + const BEFORE_PARAM_PATTERN: &str = "global .align 1 .b8 "; + const PARAM_LEN_PATTERN: &str = "["; + const LEN_BYTES_PATTERN: &str = "] = {"; + const AFTER_BYTES_PATTERN: &str = "};"; - let after_type_layout_start = type_layout_start + type_layout_start_pattern.len(); + let mut type_layouts = Vec::new(); + + while let Some(type_layout_start) = kernel_ptx.find(BEFORE_PARAM_PATTERN) { + let param_start = type_layout_start + BEFORE_PARAM_PATTERN.len(); - let Some(type_layout_middle) = kernel_ptx[after_type_layout_start..] 
- .find(&format!(".visible .entry {kernel_layout_name}")) - .map(|i| after_type_layout_start + i) - else { + let Some(len_start_offset) = kernel_ptx[param_start..].find(PARAM_LEN_PATTERN) else { abort_call_site!( - "Kernel compilation generated invalid PTX: incomplete type layout information" + "Kernel compilation generated invalid PTX: missing type layout data" ) }; + let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); - let mut next_type_layout = after_type_layout_start; + let Some(bytes_start_offset) = kernel_ptx[len_start..].find(LEN_BYTES_PATTERN) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: missing type layout length" + ) + }; + let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); - while let Some(param_start_offset) = - kernel_ptx[next_type_layout..type_layout_middle].find(BEFORE_PARAM_PATTERN) - { - let param_start = next_type_layout + param_start_offset + BEFORE_PARAM_PATTERN.len(); + let Some(bytes_end_offset) = kernel_ptx[bytes_start..].find(AFTER_BYTES_PATTERN) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout data" + ) + }; + let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; + let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; + let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - if let Some(len_start_offset) = - kernel_ptx[param_start..type_layout_middle].find(PARAM_LEN_PATTERN) - { - let len_start = param_start + len_start_offset + PARAM_LEN_PATTERN.len(); + let param = quote::format_ident!("{}", param); - if let Some(bytes_start_offset) = - kernel_ptx[len_start..type_layout_middle].find(LEN_BYTES_PATTERN) - { - let bytes_start = len_start + bytes_start_offset + LEN_BYTES_PATTERN.len(); + let Ok(len) = len.parse::() else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout length" + ) + }; + let Ok(bytes) = bytes.split(", 
").map(std::str::FromStr::from_str).collect::, _>>() else { + abort_call_site!( + "Kernel compilation generated invalid PTX: invalid type layout byte" + ) + }; - if let Some(bytes_end_offset) = - kernel_ptx[bytes_start..type_layout_middle].find(AFTER_BYTES_PATTERN) - { - let param = &kernel_ptx[param_start..(param_start + len_start_offset)]; - let len = &kernel_ptx[len_start..(len_start + bytes_start_offset)]; - let bytes = &kernel_ptx[bytes_start..(bytes_start + bytes_end_offset)]; - - let param = quote::format_ident!("{}", param); - - let Ok(len) = len.parse::() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - length" - ) - }; - let Ok(bytes) = bytes - .split(", ") - .map(std::str::FromStr::from_str) - .collect::, _>>() - else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout \ - byte" - ) - }; - - if bytes.len() != len { - abort_call_site!( - "Kernel compilation generated invalid PTX: type layout length \ - mismatch" - ); - } - - let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); - - type_layouts.push(quote! { - const #param: &[u8; #len] = #byte_str; - }); - - next_type_layout = - bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); - } else { - next_type_layout = bytes_start; - } - } else { - next_type_layout = len_start; - } - } else { - next_type_layout = param_start; - } + if bytes.len() != len { + abort_call_site!( + "Kernel compilation generated invalid PTX: type layout length mismatch" + ); } - let Some(type_layout_end) = kernel_ptx[type_layout_middle..] - .find('}') - .map(|i| type_layout_middle + i + '}'.len_utf8()) - else { - abort_call_site!("Kernel compilation generated invalid PTX") - }; + let byte_str = syn::LitByteStr::new(&bytes, proc_macro2::Span::call_site()); + + type_layouts.push(quote! 
{ + const #param: &[u8; #len] = #byte_str; + }); + + let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } - check_kernel_ptx_and_report( - &kernel_ptx, - Specialisation::Link(&specialisation), - &kernel_hash, - &ptx_lint_levels, - ); + type_layouts +} - (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() +fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) { + while let Some(kernel_type_layout_start) = kernel_ptx.find(KERNEL_TYPE_USE_START_CANARY) { + let kernel_type_layout_start = kernel_ptx[..kernel_type_layout_start] + .rfind('\n') + .unwrap_or(kernel_type_layout_start); + + let Some(kernel_type_layout_end_offset) = kernel_ptx[ + kernel_type_layout_start.. + ].find(KERNEL_TYPE_USE_END_CANARY) else { + abort_call_site!( + "Kernel compilation generated invalid PTX: incomplete type layout use section" + ); + }; + + let kernel_type_layout_end_offset = kernel_type_layout_end_offset + + kernel_ptx[kernel_type_layout_start + kernel_type_layout_end_offset..] 
+ .find('\n') + .unwrap_or(KERNEL_TYPE_USE_END_CANARY.len()); + + let kernel_type_layout_end = kernel_type_layout_start + kernel_type_layout_end_offset; + + kernel_ptx.replace_range(kernel_type_layout_start..kernel_type_layout_end, ""); + } } #[allow(clippy::too_many_lines)] diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-derive/src/kernel/mod.rs index 6dff13380..9e3a80789 100644 --- a/rust-cuda-derive/src/kernel/mod.rs +++ b/rust-cuda-derive/src/kernel/mod.rs @@ -4,3 +4,6 @@ pub mod wrapper; mod lints; mod utils; + +const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; +const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 29473858e..04e396d70 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -2,7 +2,10 @@ use proc_macro2::TokenStream; use quote::quote_spanned; use syn::spanned::Spanned; -use super::super::{FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::{ + super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, + FuncIdent, FunctionInputs, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( @@ -96,29 +99,27 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); - let func_type_layout_ident = quote::format_ident!("{}_type_layout", func_ident); - quote! 
{ #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_type_layout_ident(#(#func_params: &mut &[u8]),*) { + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + unsafe { + ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); + } #( #[no_mangle] static #func_layout_params: [ u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); - *#func_params = &#func_layout_params; + unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* - } + unsafe { + ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); + } - #[cfg(target_os = "cuda")] - #[#crate_path::device::specialise_kernel_entry(#args)] - #[no_mangle] - #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { use super::#args; diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs index 533021b90..c647a65eb 100644 --- a/rust-cuda-ptx-jit/src/device.rs +++ b/rust-cuda-ptx-jit/src/device.rs @@ -5,7 +5,7 @@ macro_rules! 
PtxJITConstLoad { ([$index:literal] => $reference:expr) => { unsafe { ::core::arch::asm!( - concat!("// //"), + ::core::concat!("// //"), in(reg32) *($reference as *const _ as *const u32), ) } From d28f237af93f28e0d0b2a7a3a8d6c0421925a13b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 12:42:21 +0000 Subject: [PATCH 037/120] Fix up rebase issues --- rust-cuda-derive/src/kernel/link/mod.rs | 36 +++++++++++-------------- rust-cuda-derive/src/kernel/lints.rs | 19 ++++++++++--- rust-toolchain | 2 +- src/utils/device_copy.rs | 4 +-- src/utils/exchange/buffer/device.rs | 4 +-- src/utils/exchange/buffer/host.rs | 4 +-- 6 files changed, 36 insertions(+), 33 deletions(-) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 8df29f33a..b03c9f756 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -54,7 +54,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { - return quote!(::core::result::Result::Err(())).into() + return quote!(::core::result::Result::Err(())).into(); }; check_kernel_ptx_and_report( @@ -126,7 +126,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { - const BEFORE_PARAM_PATTERN: &str = "global .align 1 .b8 "; + const BEFORE_PARAM_PATTERN: &str = ".visible .global .align 1 .b8 "; const PARAM_LEN_PATTERN: &str = "["; const LEN_BYTES_PATTERN: &str = "] = {"; const AFTER_BYTES_PATTERN: &str = "};"; @@ -137,23 +137,17 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec Vec() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout length" - ) + abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout length") }; - let Ok(bytes) = bytes.split(", 
").map(std::str::FromStr::from_str).collect::, _>>() else { - abort_call_site!( - "Kernel compilation generated invalid PTX: invalid type layout byte" - ) + let Ok(bytes) = bytes + .split(", ") + .map(std::str::FromStr::from_str) + .collect::, _>>() + else { + abort_call_site!("Kernel compilation generated invalid PTX: invalid type layout byte") }; if bytes.len() != len { @@ -198,9 +192,9 @@ fn remove_kernel_type_use_from_ptx(kernel_ptx: &mut String) { .rfind('\n') .unwrap_or(kernel_type_layout_start); - let Some(kernel_type_layout_end_offset) = kernel_ptx[ - kernel_type_layout_start.. - ].find(KERNEL_TYPE_USE_END_CANARY) else { + let Some(kernel_type_layout_end_offset) = + kernel_ptx[kernel_type_layout_start..].find(KERNEL_TYPE_USE_END_CANARY) + else { abort_call_site!( "Kernel compilation generated invalid PTX: incomplete type layout use section" ); diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index e91222dcd..6da06ed4b 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -2,6 +2,7 @@ use std::{collections::HashMap, fmt}; use syn::spanned::Spanned; +#[allow(clippy::too_many_lines)] pub fn parse_ptx_lint_level( path: &syn::Path, nested: &syn::punctuated::Punctuated, @@ -46,10 +47,15 @@ pub fn parse_ptx_lint_level( continue; } - let Some(syn::PathSegment { ident: namespace, arguments: syn::PathArguments::None }) = path.segments.first() else { + let Some(syn::PathSegment { + ident: namespace, + arguments: syn::PathArguments::None, + }) = path.segments.first() + else { emit_error!( meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", level, ); continue; @@ -65,10 +71,15 @@ pub fn parse_ptx_lint_level( continue; } - let Some(syn::PathSegment { ident: lint, arguments: syn::PathArguments::None }) = path.segments.last() else { + let 
Some(syn::PathSegment { + ident: lint, + arguments: syn::PathArguments::None, + }) = path.segments.last() + else { emit_error!( meta.span(), - "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form `ptx::lint`.", + "[rust-cuda]: Invalid #[kernel({}())] attribute: must be of the form \ + `ptx::lint`.", level, ); continue; diff --git a/rust-toolchain b/rust-toolchain index 512b40786..d6e655e5f 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,5 @@ [toolchain] -# Pin version pin until const traits are back +# Pin to final 1.75.0 nightly channel = "nightly-2023-11-10" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 0c77a8d1a..2869cd296 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -102,9 +102,7 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } -unsafe impl RustToCudaAsync - for SafeDeviceCopyWrapper -{ +unsafe impl RustToCudaAsync for SafeDeviceCopyWrapper { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 09ffa2b43..f6f00248b 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -48,7 +48,7 @@ unsafe impl - RustToCudaAsync for CudaExchangeBufferDevice +unsafe impl RustToCudaAsync + for CudaExchangeBufferDevice { } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 384f290bb..24a95bfe3 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -160,8 +160,8 @@ unsafe impl - RustToCudaAsync for CudaExchangeBufferHost +unsafe impl RustToCudaAsync + for CudaExchangeBufferHost { #[allow(clippy::type_complexity)] unsafe fn borrow_async( From e9bb611f60fe2222b2bb3be247aa44d6a7cd6a66 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 12:47:23 
+0000 Subject: [PATCH 038/120] Install CUDA in all CI steps --- .github/workflows/ci.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e66a8ed9..954395a77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,16 @@ jobs: rust: [nightly] steps: + - name: Install CUDA + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" + sudo apt-get update -q + sudo apt-get install cuda -y --no-install-recommends + - name: Checkout the Repository uses: actions/checkout@v2 @@ -129,6 +139,16 @@ jobs: rust: [nightly] steps: + - name: Install CUDA + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin + sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 + curl -L -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb + sudo dpkg -i cuda-keyring_1.0-1_all.deb + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" + sudo apt-get update -q + sudo apt-get install cuda -y --no-install-recommends + - name: Checkout the Repository uses: actions/checkout@v2 From 1493d97b16f9fd04a723c4cf96e85f7d7e1c6612 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 12 Dec 2023 14:27:10 +0000 Subject: [PATCH 039/120] Use CStr literals --- .github/workflows/rustdoc.yml | 1 + rust-cuda-derive/src/kernel/link/mod.rs | 24 +++++++++++------------- rust-cuda-derive/src/lib.rs | 1 + 3 files changed, 13 insertions(+), 13 
deletions(-) diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index 285fc57c2..ec466acf0 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -31,6 +31,7 @@ jobs: --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ -Zunstable-options \ " cargo doc \ --all-features \ diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index b03c9f756..ae0b5ea63 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -383,11 +383,9 @@ fn check_kernel_ptx( seahash::hash(specialisation.as_bytes()) ), }; + let kernel_name = CString::new(kernel_name).unwrap(); - let mut options = vec![ - CString::new("--entry").unwrap(), - CString::new(kernel_name).unwrap(), - ]; + let mut options = vec![c"--entry", kernel_name.as_c_str()]; if ptx_lint_levels .values() @@ -399,27 +397,27 @@ fn check_kernel_ptx( .get(&PtxLint::Verbose) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--verbose").unwrap()); + options.push(c"--verbose"); } if ptx_lint_levels .get(&PtxLint::DoublePrecisionUse) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-double-precision-use").unwrap()); + options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels .get(&PtxLint::LocalMemoryUsage) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + options.push(c"--warn-on-local-memory-usage"); } if ptx_lint_levels .get(&PtxLint::RegisterSpills) .map_or(false, |level| *level > LintLevel::Warn) { - options.push(CString::new("--warn-on-spills").unwrap()); + 
options.push(c"--warn-on-spills"); } - options.push(CString::new("--warning-as-error").unwrap()); + options.push(c"--warning-as-error"); let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); @@ -436,25 +434,25 @@ fn check_kernel_ptx( .get(&PtxLint::Verbose) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--verbose").unwrap()); + options.push(c"--verbose"); } if ptx_lint_levels .get(&PtxLint::DoublePrecisionUse) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-double-precision-use").unwrap()); + options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels .get(&PtxLint::LocalMemoryUsage) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-local-memory-usage").unwrap()); + options.push(c"--warn-on-local-memory-usage"); } if ptx_lint_levels .get(&PtxLint::RegisterSpills) .map_or(false, |level| *level > LintLevel::Allow) { - options.push(CString::new("--warn-on-spills").unwrap()); + options.push(c"--warn-on-spills"); } let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 572e1c9da..e94048081 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,7 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] +#![feature(c_str_literals)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From e09e884cd934b97d2a0663052c14e8aa62b54995 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:37:20 +0000 Subject: [PATCH 040/120] Simplify and document the safety traits --- .github/workflows/rustdoc.yml | 3 +- .vscode/settings.json | 2 +- Cargo.toml | 5 - examples/single-source/src/main.rs | 2 +- .../cpu_wrapper/kernel_func_async/mod.rs | 4 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 8 +- src/lib.rs | 3 +- src/safety/device_copy.rs | 47 ++++--- 
src/safety/mod.rs | 6 +- src/safety/no_aliasing.rs | 105 +++++++++++---- src/safety/stack_only.rs | 123 ++++++++++++------ src/safety/unified_heap.rs | 53 -------- src/utils/alloc.rs | 67 ---------- src/utils/mod.rs | 3 - src/utils/shared/slice.rs | 40 +++--- src/utils/shared/static.rs | 53 ++++---- 16 files changed, 238 insertions(+), 286 deletions(-) delete mode 100644 src/safety/unified_heap.rs delete mode 100644 src/utils/alloc.rs diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index ec466acf0..23f4f1c07 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -28,10 +28,11 @@ jobs: run: | RUSTDOCFLAGS="\ --enable-index-page \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ + --extern-html-root-url final=https://docs.rs/final/0.1.1/ \ --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ --extern-html-root-url rustacuda_derive=https://docs.rs/rustacuda_derive/0.1.2/ \ - --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ -Zunstable-options \ " cargo doc \ --all-features \ diff --git a/.vscode/settings.json b/.vscode/settings.json index 93f713cad..b033ed643 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,5 @@ "rust-analyzer.updates.askBeforeDownload": false, "rust-analyzer.checkOnSave.command": "reap-clippy", "rust-analyzer.cargo.allFeatures": false, - "rust-analyzer.cargo.features": ["alloc", "derive", "host"], + "rust-analyzer.cargo.features": ["derive", "host"], } diff --git a/Cargo.toml b/Cargo.toml index 2ebfbe32e..a2076ca1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,6 @@ rust-version = "1.75" # nightly [features] default = [] -alloc = ["hashbrown"] host = ["rustacuda", "rust-cuda-ptx-jit/host"] derive = ["rustacuda_derive", "rust-cuda-derive"] @@ -32,10 +31,6 @@ rustacuda_derive = { git = 
"https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc const-type-layout = { version = "0.2.0", features = ["derive"] } final = "0.1.1" -hashbrown = { version = "0.14", default-features = false, features = ["inline-more"], optional = true } rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" } - -[dev-dependencies] -hashbrown = { version = "0.14", default-features = false, features = ["inline-more"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index ccd384676..b80a14201 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -59,7 +59,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, ) where - T: rc::safety::StackOnly + rc::safety::NoAliasing, + T: rc::safety::StackOnly + rc::safety::NoSafeAliasing, ::CudaRepresentation: rc::safety::StackOnly, ::CudaAllocation: rc::common::EmptyCudaAlloc, { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 6e123440e..747f4a278 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -100,10 +100,10 @@ pub(super) fn quote_kernel_func_async( fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_safe_aliasing() {} #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#cpu_func_unboxed_types>();)* + #(assert_impl_no_safe_aliasing::<#cpu_func_unboxed_types>();)* } let #crate_path::host::LaunchConfig { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 
04e396d70..042ae5e7a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -64,9 +64,9 @@ pub(in super::super) fn quote_cuda_wrapper( syn::TypeReference { and_token, .. } ) = &**ty { // DeviceCopy mode only supports immutable references - quote! { #ptx_jit_load; { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } + quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } } else { - quote! { { let #pat: #syn_type = #pat.into_inner(); #inner } } + quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } }, InputCudaType::LendRustToCuda => if let syn::Type::Reference( syn::TypeReference { and_token, mutability, ..} @@ -135,10 +135,10 @@ pub(in super::super) fn quote_cuda_wrapper( fn assert_impl_devicecopy(_val: &T) {} #[allow(dead_code)] - fn assert_impl_no_aliasing() {} + fn assert_impl_no_safe_aliasing() {} #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_aliasing::<#ptx_func_unboxed_types>();)* + #(assert_impl_no_safe_aliasing::<#ptx_func_unboxed_types>();)* } #ptx_func_input_unwrap diff --git a/src/lib.rs b/src/lib.rs index de590c29b..273e27779 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,14 +16,13 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(asm_const) )] -#![cfg_attr(target_os = "cuda", feature(ptr_metadata))] -#![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] #![feature(offset_of)] #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] +#![feature(ptr_metadata)] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index ee1cef0dc..9aedc8e81 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -1,30 +1,29 @@ 
-#[allow(clippy::module_name_repetitions)] -pub trait SafeDeviceCopy: sealed::SafeDeviceCopy {} +use const_type_layout::TypeGraphLayout; -impl SafeDeviceCopy for T {} +use crate::{common::DeviceAccessible, safety::StackOnly}; -mod sealed { - #[marker] - pub trait SafeDeviceCopy {} +#[allow(clippy::module_name_repetitions)] +/// Types which are safe to memcpy from the CPU to a GPU. +/// +/// For a type to implement [`SafeDeviceCopy`], it must +/// +/// * have the same memory layout on both the CPU and GPU +/// +/// * not contain any references to data that is inaccessible from the GPU +/// +/// Types that implement both [`TypeGraphLayout`] and [`StackOnly`] satisfy +/// both of these criteria and thus implement [`SafeDeviceCopy`]. +#[marker] +pub trait SafeDeviceCopy: sealed::Sealed {} - // Thread-block-shared data cannot be copied since information is added inside - // CUDA - impl !SafeDeviceCopy for crate::utils::shared::r#static::ThreadBlockShared {} - impl !SafeDeviceCopy - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } +impl SafeDeviceCopy for T {} +impl sealed::Sealed for T {} - impl SafeDeviceCopy for T {} - #[cfg(any(feature = "alloc", doc))] - impl SafeDeviceCopy for T {} +#[doc(hidden)] +impl SafeDeviceCopy for DeviceAccessible {} +impl sealed::Sealed for DeviceAccessible {} - impl SafeDeviceCopy - for crate::common::DeviceAccessible - { - } - impl SafeDeviceCopy - for crate::utils::device_copy::SafeDeviceCopyWrapper - { - } +mod sealed { + #[marker] + pub trait Sealed {} } diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 7e5c419a5..72ed9c7db 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -2,8 +2,6 @@ mod arch; mod device_copy; mod no_aliasing; mod stack_only; -#[cfg(any(feature = "alloc", doc))] -mod unified_heap; #[doc(hidden)] pub mod kernel_signature; @@ -11,7 +9,5 @@ pub mod kernel_signature; pub mod type_layout; pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoAliasing; +pub use 
no_aliasing::NoSafeAliasing; pub use stack_only::StackOnly; -#[cfg(any(feature = "alloc", doc))] -pub use unified_heap::UnifiedHeapOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 98a180b6a..0fc3abf9c 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -1,33 +1,86 @@ #[allow(clippy::module_name_repetitions)] -pub trait NoAliasing: private::NoAliasing {} -impl NoAliasing for T {} +/// Types which can be safely shared between CUDA threads because they do +/// not provide safe aliasing mutable access to some shared inner state. +/// +/// This trait is automatically implemented when the compiler determines +/// it's appropriate. +/// +/// Data types that contain no references and can thus live entirely on +/// the stack, e.g. primitive types like [`u8`] and structs, tuples, and +/// enums made only from them, or more generally those types that implement +/// [`StackOnly`](super::StackOnly), also implement [`NoSafeAliasing`] as they +/// do not contain any inner data that might be shared when each thread is +/// given mutable access to a copy. +/// +/// In contrast, `&mut T` (and any type containing a mutable reference) do *not* +/// implement [`NoSafeAliasing`] as several threads would obtain mutable +/// aliasing access to the same date, thus violating Rust's borrowing and +/// memory safety rules. +/// +/// Even though `*const T` and `*mut T` do not provide *safe* mutable aliasing +/// access to their underlying data, as dereferincing them is always unsafe, +/// they (and any type containing a pointer) do *not* implement +/// [`NoSafeAliasing`] to ensure that any data type that uses them to build a +/// safe interface to accessing data, e.g. [`Box`], does not accidentially +/// implement [`NoSafeAliasing`]. 
If you have implemented a data structure that +/// uses `*const T` or `*mut T` internally but also ensures that no safe +/// aliasing mutable access is provided, you can *unsafely* implement +/// [`NoSafeAliasing`] for your type. Please reference the [Safety](#safety) +/// section below for more details on the contract you must uphold in this case. +/// +/// # Safety +/// +/// This trait must only be manually implemented for a type that upholds +/// the no-mutable-aliasing guarantee through its safe API. +/// +/// The following examples outline three different cases for types that do +/// fulfil this safety requirement: +/// +/// * [`Final`](final::Final) implements [`NoSafeAliasing`] +/// because even a mutable reference to it only provides read-only access +/// to its inner data. +/// +/// * [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) +/// and +/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) +/// also implement [`NoSafeAliasing`] because they only provide each CUDA thread +/// with mutable access to its own partition of a slice and thus avoid mutable +/// aliasing. +/// +/// * [`ThreadBlockShared`](crate::utils::shared::static::ThreadBlockShared) +/// and +/// [`ThreadBlockSharedSlice`](crate::utils::shared::slice::ThreadBlockSharedSlice) +/// also implement [`NoSafeAliasing`] since they only provide access to `*mut +/// T`, which is always unsafe to mutate and thus moves the burden to uphoald +/// the no-mutable-aliasing safety invariant to the user who derefereces these +/// pointers. 
+pub unsafe auto trait NoSafeAliasing {} -mod private { - pub auto trait NoAliasing {} +impl !NoSafeAliasing for &mut T {} +impl !NoSafeAliasing for *const T {} +impl !NoSafeAliasing for *mut T {} - impl !NoAliasing for *const T {} - impl !NoAliasing for *mut T {} - impl !NoAliasing for &mut T {} +unsafe impl NoSafeAliasing for core::marker::PhantomData {} - impl NoAliasing for core::marker::PhantomData {} - - impl NoAliasing for r#final::Final {} - impl NoAliasing - for crate::utils::aliasing::FinalCudaRepresentation - { - } +unsafe impl NoSafeAliasing for r#final::Final {} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::FinalCudaRepresentation +{ +} - impl NoAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride - { - } - impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride +{ +} +unsafe impl NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride +{ +} - // Thread-block-shared data only allows unsafe aliasing since only raw pointers - // are exposed - impl NoAliasing for crate::utils::shared::r#static::ThreadBlockShared {} - impl NoAliasing - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } +// Thread-block-shared data only allows unsafe aliasing since only raw pointers +// are exposed +unsafe impl NoSafeAliasing for crate::utils::shared::r#static::ThreadBlockShared {} +unsafe impl NoSafeAliasing + for crate::utils::shared::slice::ThreadBlockSharedSlice +{ } diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index 5dc5c0cbb..bfb4e80d0 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -1,47 +1,86 @@ -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// fn assert_stackonly(_x: impl StackOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// 
assert_stackonly(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::StackOnly; -/// # fn assert_stackonly(_x: impl StackOnly) {} -/// assert_stackonly(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait StackOnly: sealed::StackOnly {} -impl StackOnly for T {} +macro_rules! stack_only_docs { + ($item:item) => { + /// Types which contain no pointers or references and can thus live entirely + /// on the stack. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Primitive types like [`u8`] and structs, tuples, and enums made only + /// from them implement [`StackOnly`]. + /// + /// In contrast, `&T`, `&mut T`, `*const T`, `*mut T`, and any type + /// containing a reference or a pointer do *not* implement [`StackOnly`]. 
+ /// + /// # Examples + /// + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// fn assert_stackonly(_x: impl StackOnly) {} + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(42); // ok + /// ``` + /// ```rust + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly([42; 42]); // ok + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(vec![42]); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// assert_stackonly(&42); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::r#static::ThreadBlockShared; + /// assert_stackonly(ThreadBlockShared::new_uninit()); // error + /// ``` + /// ```rust,compile_fail + /// # use alloc::vec; + /// # use rust_cuda::safety::StackOnly; + /// # fn assert_stackonly(_x: impl StackOnly) {} + /// # use crate::utils::shared::slice::ThreadBlockSharedSlice; + /// assert_stackonly(ThreadBlockSharedSlice::new_uninit_with_len(0)); // error + /// ``` + $item + }; +} -mod sealed { - pub auto trait StackOnly {} +#[cfg(not(doc))] +stack_only_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait StackOnly: sealed::Sealed {} +} +#[cfg(doc)] +stack_only_docs! 
{ + pub use sealed::Sealed as StackOnly; +} + +#[cfg(not(doc))] +impl StackOnly for T {} - impl !StackOnly for *const T {} - impl !StackOnly for *mut T {} - impl !StackOnly for &T {} - impl !StackOnly for &mut T {} +mod sealed { + pub auto trait Sealed {} - // Thread-block-shared data contains data not on the stack - impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !StackOnly - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } + impl !Sealed for &T {} + impl !Sealed for &mut T {} + impl !Sealed for *const T {} + impl !Sealed for *mut T {} - impl StackOnly for core::marker::PhantomData {} + impl Sealed for core::marker::PhantomData {} } diff --git a/src/safety/unified_heap.rs b/src/safety/unified_heap.rs deleted file mode 100644 index 483b40c3a..000000000 --- a/src/safety/unified_heap.rs +++ /dev/null @@ -1,53 +0,0 @@ -#[doc(cfg(feature = "alloc"))] -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(42); -/// ``` -/// ```rust -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only([42; 42]); -/// ``` -/// ```rust,compile_fail -/// # use alloc::vec; -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(vec![42]); -/// ``` -/// ```rust,compile_fail -/// # use rust_cuda::safety::UnifiedHeapOnly; -/// # fn assert_unified_heap_only(_x: impl UnifiedHeapOnly) {} -/// assert_unified_heap_only(&42); -/// ``` -#[allow(clippy::module_name_repetitions)] -pub trait UnifiedHeapOnly: sealed::UnifiedHeapOnly {} -impl UnifiedHeapOnly for T {} - -mod sealed { - use crate::utils::alloc::UnifiedAllocator; - - pub auto trait UnifiedHeapOnly {} 
- - impl !UnifiedHeapOnly for *const T {} - impl !UnifiedHeapOnly for *mut T {} - impl !UnifiedHeapOnly for &T {} - impl !UnifiedHeapOnly for &mut T {} - - // Thread-block-shared data contains CUDA-only data - impl !UnifiedHeapOnly for crate::utils::shared::r#static::ThreadBlockShared {} - impl !UnifiedHeapOnly - for crate::utils::shared::slice::ThreadBlockSharedSlice - { - } - - impl UnifiedHeapOnly for core::marker::PhantomData {} - - impl UnifiedHeapOnly for alloc::boxed::Box {} - impl UnifiedHeapOnly for alloc::vec::Vec {} - impl UnifiedHeapOnly for hashbrown::HashMap {} -} diff --git a/src/utils/alloc.rs b/src/utils/alloc.rs deleted file mode 100644 index 3bbcf225b..000000000 --- a/src/utils/alloc.rs +++ /dev/null @@ -1,67 +0,0 @@ -use alloc::alloc::{AllocError, Allocator, Layout}; -use core::ptr::NonNull; - -#[allow(clippy::module_name_repetitions)] -pub struct UnifiedAllocator; - -unsafe impl Allocator for UnifiedAllocator { - #[cfg(feature = "host")] - fn allocate(&self, layout: Layout) -> Result, AllocError> { - if layout.size() == 0 { - return Ok(NonNull::<[u8; 0]>::dangling()); - } - - match layout.align() { - 1 => alloc_unified_aligned::(layout.size()), - 2 => alloc_unified_aligned::(layout.size() >> 1), - 4 => alloc_unified_aligned::(layout.size() >> 2), - 8 => alloc_unified_aligned::(layout.size() >> 3), - _ => Err(AllocError), - } - } - - #[cfg(not(feature = "host"))] - fn allocate(&self, _layout: Layout) -> Result, AllocError> { - Err(AllocError) - } - - #[cfg(feature = "host")] - unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { - use rustacuda::{ - error::CudaResult, - memory::{cuda_free_unified, UnifiedPointer}, - }; - - if layout.size() == 0 { - return; - } - - let _: CudaResult<()> = cuda_free_unified(UnifiedPointer::wrap(ptr.as_ptr())); - } - - #[cfg(not(feature = "host"))] - unsafe fn deallocate(&self, _ptr: NonNull, _layout: Layout) { - // no-op - } -} - -#[cfg(feature = "host")] -fn alloc_unified_aligned( - size: usize, -) -> 
Result, AllocError> { - use rustacuda::memory::cuda_malloc_unified; - - match unsafe { cuda_malloc_unified::(size) } { - Ok(mut ptr) => { - let bytes: &mut [u8] = unsafe { - core::slice::from_raw_parts_mut( - ptr.as_raw_mut().cast(), - size * core::mem::align_of::(), - ) - }; - - NonNull::new(bytes).ok_or(AllocError) - }, - Err(_) => Err(AllocError), - } -} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index c70432f31..dadf5a443 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,7 +1,4 @@ pub mod aliasing; -#[cfg(any(feature = "alloc", doc))] -#[doc(cfg(feature = "alloc"))] -pub mod alloc; pub mod device_copy; pub mod exchange; pub mod shared; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 0a8a66c62..804623ae4 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,20 +1,12 @@ -#[cfg(not(target_os = "cuda"))] -use core::marker::PhantomData; - use const_type_layout::TypeGraphLayout; -#[cfg(not(target_os = "cuda"))] -#[allow(clippy::module_name_repetitions)] -#[repr(transparent)] -pub struct ThreadBlockSharedSlice { - len: usize, - marker: PhantomData, -} - -#[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] #[repr(transparent)] pub struct ThreadBlockSharedSlice { + #[cfg(not(target_os = "cuda"))] + // dangling marker s.t. 
Self is not StackOnly + dangling: *mut [T], + #[cfg(target_os = "cuda")] shared: *mut [T], } @@ -24,8 +16,7 @@ impl ThreadBlockSharedSlice { #[must_use] pub fn new_uninit_with_len(len: usize) -> Self { Self { - len, - marker: PhantomData::, + dangling: Self::dangling_slice_with_len(len), } } @@ -33,7 +24,7 @@ impl ThreadBlockSharedSlice { #[doc(cfg(not(target_os = "cuda")))] #[must_use] pub fn with_len(mut self, len: usize) -> Self { - self.len = len; + self.dangling = Self::dangling_slice_with_len(len); self } @@ -41,20 +32,27 @@ impl ThreadBlockSharedSlice { #[doc(cfg(not(target_os = "cuda")))] #[must_use] pub fn with_len_mut(&mut self, len: usize) -> &mut Self { - self.len = len; + self.dangling = Self::dangling_slice_with_len(len); self } #[cfg(not(target_os = "cuda"))] - #[must_use] - pub fn len(&self) -> usize { - self.len + fn dangling_slice_with_len(len: usize) -> *mut [T] { + core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len) } - #[cfg(target_os = "cuda")] #[must_use] pub fn len(&self) -> usize { - core::ptr::metadata(self.shared) + core::ptr::metadata({ + #[cfg(not(target_os = "cuda"))] + { + self.dangling + } + #[cfg(target_os = "cuda")] + { + self.shared + } + }) } #[must_use] diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 5b8cdfc52..0ba7f9df0 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,43 +1,38 @@ -#[cfg(not(target_os = "cuda"))] -use core::marker::PhantomData; - -#[cfg(not(target_os = "cuda"))] -#[repr(transparent)] -pub struct ThreadBlockShared { - marker: PhantomData, -} - -#[cfg(target_os = "cuda")] #[repr(transparent)] pub struct ThreadBlockShared { + #[cfg(not(target_os = "cuda"))] + // dangling marker s.t. 
Self is not StackOnly + _dangling: *mut T, + #[cfg(target_os = "cuda")] shared: *mut T, } impl ThreadBlockShared { - #[cfg(not(target_os = "cuda"))] #[must_use] pub fn new_uninit() -> Self { - Self { - marker: PhantomData::, + #[cfg(not(target_os = "cuda"))] + { + Self { + _dangling: core::ptr::NonNull::dangling().as_ptr(), + } } - } - #[cfg(target_os = "cuda")] - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); + #[cfg(target_os = "cuda")] + { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } } - - Self { shared } } #[cfg(any(target_os = "cuda", doc))] From 4baa5dc7b49ad1d9e1830da981e866945df1dc59 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:42:12 +0000 Subject: [PATCH 041/120] Fix move_to_cuda bound --- src/device/mod.rs | 4 ++-- src/host.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device/mod.rs b/src/device/mod.rs index 45c833923..699424355 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -51,7 +51,7 @@ pub trait BorrowFromRust: RustToCuda { inner: F, ) -> O where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy; } @@ -86,7 +86,7 @@ impl BorrowFromRust for T { inner: F, ) -> O where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, { inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) diff --git a/src/host.rs b/src/host.rs index aed9aaa83..9709798d3 100644 --- 
a/src/host.rs +++ b/src/host.rs @@ -180,7 +180,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc; } @@ -238,7 +238,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: Sized + SafeDeviceCopy, + Self: Sized, ::CudaRepresentation: SafeDeviceCopy, ::CudaAllocation: EmptyCudaAlloc, { From 720d14a48fe1b01a2a6ff1a848c54d4215b7f766 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 13 Dec 2023 13:58:43 +0000 Subject: [PATCH 042/120] Fix clippy for 1.76 --- rust-cuda-derive/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index e94048081..74e76a2cc 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,7 +5,8 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] -#![feature(c_str_literals)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; From 942c5f9a748e33c41fefe1cb8b22451b91de3334 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 14 Dec 2023 13:51:17 +0000 Subject: [PATCH 043/120] Cleaned up the rust-cuda device macros with better print The implementation still uses String for dynamic formatting, which currently pulls in loads of formatting and panic machinery. While a custom String type that pre-allocated the exact format String length can avoid some of that, the formatting machinery even for e.g. usize is still large. If `format_args!` is ever optimised for better inlining, the more verbose and lower-level implementation could be reconsidered. 
--- .vscode/settings.json | 6 +- Cargo.toml | 2 +- examples/derive/Cargo.toml | 3 +- examples/print/.cargo/config.toml | 5 + examples/print/Cargo.toml | 14 +++ examples/print/src/main.rs | 69 +++++++++++ examples/single-source/.cargo/config.toml | 2 +- examples/single-source/Cargo.toml | 5 +- examples/single-source/src/main.rs | 7 +- rust-cuda-derive/Cargo.toml | 2 +- rust-cuda-derive/src/kernel/link/mod.rs | 12 ++ .../src/kernel/link/ptx_compiler_sys.rs | 10 +- rust-cuda-derive/src/kernel/lints.rs | 3 + .../generate/cpu_linker_macro/get_ptx_str.rs | 1 + .../kernel/wrapper/generate/cuda_wrapper.rs | 1 + rust-cuda-derive/src/kernel/wrapper/mod.rs | 1 + rust-cuda-ptx-jit/Cargo.toml | 2 +- src/device/macros.rs | 115 ------------------ src/device/mod.rs | 3 +- src/device/utils.rs | 28 +++++ 20 files changed, 153 insertions(+), 138 deletions(-) create mode 100644 examples/print/.cargo/config.toml create mode 100644 examples/print/Cargo.toml create mode 100644 examples/print/src/main.rs delete mode 100644 src/device/macros.rs create mode 100644 src/device/utils.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index b033ed643..c2b4219f5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,9 @@ "rust-analyzer.updates.askBeforeDownload": false, "rust-analyzer.checkOnSave.command": "reap-clippy", "rust-analyzer.cargo.allFeatures": false, - "rust-analyzer.cargo.features": ["derive", "host"], + "rust-analyzer.cargo.features": [ + "derive", + "host" + ], + "rust-analyzer.showUnlinkedFileNotification": false, } diff --git a/Cargo.toml b/Cargo.toml index a2076ca1c..9e9a568f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] members = [ ".", "rust-cuda-derive", "rust-cuda-ptx-jit", - "examples/single-source", "examples/derive", + "examples/derive", "examples/print", "examples/single-source", ] default-members = [ ".", "rust-cuda-derive", "rust-cuda-ptx-jit" diff --git a/examples/derive/Cargo.toml b/examples/derive/Cargo.toml 
index f4ea53d90..1b000fe8c 100644 --- a/examples/derive/Cargo.toml +++ b/examples/derive/Cargo.toml @@ -1,12 +1,11 @@ [package] name = "derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -const-type-layout = { version = "0.2.0" } rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml new file mode 100644 index 000000000..f7029e166 --- /dev/null +++ b/examples/print/.cargo/config.toml @@ -0,0 +1,5 @@ +[target.nvptx64-nvidia-cuda] +rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] + +[unstable] +features = ["all"] diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml new file mode 100644 index 000000000..21f513d8f --- /dev/null +++ b/examples/print/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "print" +version = "0.1.0" +authors = ["Juniper Tyree "] +license = "MIT OR Apache-2.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[target.'cfg(target_os = "cuda")'.dependencies] +rust-cuda = { path = "../../", features = ["derive"] } + +[target.'cfg(not(target_os = "cuda"))'.dependencies] +rust-cuda = { path = "../../", features = ["derive", "host"] } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs new file mode 100644 index 000000000..49492e0a4 --- /dev/null +++ b/examples/print/src/main.rs @@ -0,0 +1,69 @@ +#![deny(clippy::pedantic)] +#![cfg_attr(target_os = "cuda", no_std)] +#![cfg_attr(target_os = "cuda", no_main)] +#![cfg_attr(target_os = "cuda", feature(abi_ptx))] +#![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] +#![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] 
+#![cfg_attr(target_os = "cuda", feature(core_panic))] + +extern crate alloc; + +#[cfg(not(target_os = "cuda"))] +fn main() {} + +#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[kernel(deny( + ptx::double_precision_use, + ptx::local_memory_usage, + ptx::register_spills, + ptx::dynamic_stack_size +))] +pub fn kernel() { + rust_cuda::device::utils::print(format_args!("println! from CUDA kernel")); +} + +#[cfg(not(target_os = "cuda"))] +mod host { + #[allow(unused_imports)] + use super::KernelArgs; + use super::{Kernel, KernelPtx}; + + #[allow(dead_code)] + struct Launcher; + + link_kernel!(); + + impl rust_cuda::host::Launcher for Launcher { + type CompilationWatcher = (); + type KernelTraitObject = dyn Kernel; + + fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { + unimplemented!() + } + } +} + +#[cfg(target_os = "cuda")] +mod cuda_prelude { + use rust_cuda::device::alloc::PTXAllocator; + + #[global_allocator] + static _GLOBAL_ALLOCATOR: PTXAllocator = PTXAllocator; + + #[panic_handler] + fn panic(info: &::core::panic::PanicInfo) -> ! { + rust_cuda::device::utils::print(format_args!("{info}\n")); + + rust_cuda::device::utils::abort() + } + + #[alloc_error_handler] + fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ + let (size, align) = (layout.size(), layout.align()); + + ::core::panicking::panic_nounwind_fmt( + format_args!("memory allocation of {size} bytes with alignment {align} failed\n"), + true, + ) + } +} diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml index 48db9d693..f7029e166 100644 --- a/examples/single-source/.cargo/config.toml +++ b/examples/single-source/.cargo/config.toml @@ -1,5 +1,5 @@ [target.nvptx64-nvidia-cuda] -rustflags = ["-Clink-args=--arch sm_35", "-Clink-arg=-O3", "-Clink-arg=--lto"] +rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] [unstable] features = ["all"] diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 351d694a0..eeada181d 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -1,15 +1,12 @@ [package] name = "single-source" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -[dependencies] -const-type-layout = { version = "0.2.0" } - [target.'cfg(target_os = "cuda")'.dependencies] rc = { package = "rust-cuda", path = "../../", features = ["derive"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b80a14201..085bd3b8d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -3,7 +3,6 @@ #![cfg_attr(target_os = "cuda", no_main)] #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] -#![cfg_attr(target_os = "cuda", feature(stdsimd))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(offset_of)] @@ -103,8 +102,6 @@ mod host { #[cfg(target_os = "cuda")] mod cuda_prelude { - use 
core::arch::nvptx; - use rc::device::alloc::PTXAllocator; #[global_allocator] @@ -112,11 +109,11 @@ mod cuda_prelude { #[panic_handler] fn panic(_: &::core::panic::PanicInfo) -> ! { - unsafe { nvptx::trap() } + rc::device::utils::abort() } #[alloc_error_handler] fn alloc_error_handler(_: core::alloc::Layout) -> ! { - unsafe { nvptx::trap() } + rc::device::utils::abort() } } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 41ad5a33f..31a686008 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rust-cuda-derive" version = "0.1.0" -authors = ["Juniper Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" links = "libnvptxcompiler_static" diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index ae0b5ea63..78a352780 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -417,6 +417,12 @@ fn check_kernel_ptx( { options.push(c"--warn-on-spills"); } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level <= LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } options.push(c"--warning-as-error"); let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); @@ -454,6 +460,12 @@ fn check_kernel_ptx( { options.push(c"--warn-on-spills"); } + if ptx_lint_levels + .get(&PtxLint::DynamicStackSize) + .map_or(true, |level| *level < LintLevel::Warn) + { + options.push(c"--suppress-stack-size-warning"); + } let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs index 0ab332dad..fac72cebf 100644 --- a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs @@ -4,7 +4,7 @@ use thiserror::Error; pub type size_t = 
::std::os::raw::c_ulonglong; #[repr(C)] -pub struct nvPTXCompiler { +pub struct NvptxCompiler { _private: [u8; 0], } @@ -60,15 +60,15 @@ impl NvptxError { } } -/// [`nvPTXCompilerHandle`] represents a handle to the PTX Compiler. +/// [`NvptxCompilerHandle`] represents a handle to the PTX Compiler. /// -/// To compile a PTX program string, an instance of [`nvPTXCompiler`] +/// To compile a PTX program string, an instance of [`NvptxCompiler`] /// must be created and the handle to it must be obtained using the /// API [`nvPTXCompilerCreate`]. Then the compilation can be done /// using the API [`nvPTXCompilerCompile`]. -pub type NvptxCompilerHandle = *mut nvPTXCompiler; +pub type NvptxCompilerHandle = *mut NvptxCompiler; -/// The [`nvPTXCompiler`] APIs return the [`nvPTXCompileResult`] codes to +/// The [`NvptxCompiler`] APIs return the [`NvptxCompileResult`] codes to /// indicate the call result"] pub type NvptxCompileResult = ::std::os::raw::c_int; diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-derive/src/kernel/lints.rs index 6da06ed4b..6c198b71a 100644 --- a/rust-cuda-derive/src/kernel/lints.rs +++ b/rust-cuda-derive/src/kernel/lints.rs @@ -91,6 +91,7 @@ pub fn parse_ptx_lint_level( l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, l if l == "register_spills" => PtxLint::RegisterSpills, l if l == "dump_binary" => PtxLint::DumpBinary, + l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, _ => { emit_error!( meta.span(), @@ -153,6 +154,7 @@ pub enum PtxLint { LocalMemoryUsage, RegisterSpills, DumpBinary, + DynamicStackSize, } impl fmt::Display for PtxLint { @@ -163,6 +165,7 @@ impl fmt::Display for PtxLint { Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), Self::RegisterSpills => fmt.write_str("register_spills"), Self::DumpBinary => fmt.write_str("dump_binary"), + Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), } } } diff --git 
a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index d62445803..10732a133 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -91,6 +91,7 @@ pub(super) fn quote_get_ptx_str( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { + #[allow(unused_imports)] use super::#args; extern "C" { #( diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 042ae5e7a..40d4abfbf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -122,6 +122,7 @@ pub(in super::super) fn quote_cuda_wrapper( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { + #[allow(unused_imports)] use super::#args; extern "C" { #( diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a70c38e94..a812f9dd4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -114,6 +114,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn); let ptx_lint_levels = { let (lints, levels): (Vec, Vec) = ptx_lint_levels diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml index aa7fa32c6..dc5fe4249 100644 --- a/rust-cuda-ptx-jit/Cargo.toml +++ b/rust-cuda-ptx-jit/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rust-cuda-ptx-jit" version = "0.1.0" -authors = ["Juniper 
Tyree "] +authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" diff --git a/src/device/macros.rs b/src/device/macros.rs deleted file mode 100644 index 932ca75ae..000000000 --- a/src/device/macros.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Based on https://github.com/popzxc/stdext-rs/blob/master/src/macros.rs -#[macro_export] -#[doc(hidden)] -macro_rules! function { - () => {{ - // Hack to get the name of the enclosing function - fn f() {} - fn type_name_of(_: T) -> &'static str { - core::any::type_name::() - } - let name = type_name_of(f); - - // Remove the `::f` suffix - &name[..name.len() - 3] - }}; -} - -/// Alternative of [`std::print!`](https://doc.rust-lang.org/std/macro.print.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! print { - ($($arg:tt)*) => { - let msg = $crate::alloc::format!($($arg)*); - - #[allow(unused_unsafe)] - unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); - } - } -} - -/// Alternative of [`std::println!`](https://doc.rust-lang.org/std/macro.println.html) using CUDA `vprintf` system-call -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! println { - () => ($crate::print!("\n")); - ($fmt:expr) => ($crate::print!(concat!($fmt, "\n"))); - ($fmt:expr, $($arg:tt)*) => ($crate::print!(concat!($fmt, "\n"), $($arg)*)); -} - -/// Assertion in GPU kernel for one expression is true. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert { - ($e:expr) => { - if !$e { - let msg = $crate::alloc::format!( - "\nassertion failed: {}\nexpression: {:?}", - stringify!($e), - $e, - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are equal. 
-#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_eq { - ($a:expr, $b:expr) => { - if $a != $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} == {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} - -/// Assertion in GPU kernel for two expressions are not equal. -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -#[macro_export] -macro_rules! assert_ne { - ($a:expr, $b:expr) => { - if $a == $b { - let msg = $crate::alloc::format!( - "\nassertion failed: ({} != {})\nleft : {:?}\nright: {:?}", - stringify!($a), - stringify!($b), - $a, - $b - ); - - unsafe { - ::core::arch::nvptx::__assert_fail( - msg.as_ptr(), - file!().as_ptr(), - line!(), - $crate::function!().as_ptr(), - ) - }; - } - }; -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 699424355..ca9aab9fd 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -14,8 +14,7 @@ use crate::{ pub mod alloc; pub mod thread; - -mod macros; +pub mod utils; pub trait BorrowFromRust: RustToCuda { /// # Safety diff --git a/src/device/utils.rs b/src/device/utils.rs new file mode 100644 index 000000000..e12f5b83c --- /dev/null +++ b/src/device/utils.rs @@ -0,0 +1,28 @@ +/// Abort the CUDA kernel using the `trap` system call. +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn abort() -> ! { + unsafe { ::core::arch::nvptx::trap() } +} + +/// The [`print`](print()) function takes an [`Arguments`](core::fmt::Arguments) +/// struct and formats and prints it to the CUDA kernel's standard output using +/// the `vprintf` system call. +/// +/// The [`Arguments`](core::fmt::Arguments) instance can be created with the +/// [`format_args!`](core::format_args) macro. 
+#[allow(clippy::inline_always)] +#[inline(always)] +pub fn print(args: ::core::fmt::Arguments) { + let msg; // place to store the dynamically expanded format string + let msg = if let Some(msg) = args.as_str() { + msg + } else { + msg = ::alloc::fmt::format(args); + msg.as_str() + }; + + unsafe { + ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); + } +} From 068e4584d7be8818d3ee16c7e8547a16b498728f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 00:06:00 +0000 Subject: [PATCH 044/120] Switch to using more vprintf in embedded CUDA kernel --- examples/print/src/main.rs | 49 ++++++++++++++++++++++++++++---------- src/device/alloc.rs | 4 ++++ src/device/utils.rs | 16 ++++++++++++- src/lib.rs | 3 +++ 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 49492e0a4..4d38e0ede 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -4,7 +4,9 @@ #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] -#![cfg_attr(target_os = "cuda", feature(core_panic))] +#![feature(ptr_from_ref)] +#![feature(stdsimd)] +#![feature(c_str_literals)] extern crate alloc; @@ -12,14 +14,11 @@ extern crate alloc; fn main() {} #[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] -#[kernel(deny( - ptx::double_precision_use, - ptx::local_memory_usage, - ptx::register_spills, - ptx::dynamic_stack_size -))] +#[kernel(allow(ptx::local_memory_usage))] pub fn kernel() { rust_cuda::device::utils::print(format_args!("println! from CUDA kernel")); + + ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); } #[cfg(not(target_os = "cuda"))] @@ -58,12 +57,38 @@ mod cuda_prelude { } #[alloc_error_handler] + #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ - let (size, align) = (layout.size(), layout.align()); + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + } + + let location = ::core::panic::Location::caller(); - ::core::panicking::panic_nounwind_fmt( - format_args!("memory allocation of {size} bytes with alignment {align} failed\n"), - true, - ) + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at %.*s:%lu:%lu\n" + .as_ptr() + .cast(), + #[allow(clippy::cast_possible_truncation)] + ::core::ptr::from_ref(&FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: location.file().len() as u32, + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + }) + .cast(), + ); + } + + rust_cuda::device::utils::abort() } } diff --git a/src/device/alloc.rs b/src/device/alloc.rs index 14a294814..0217fa939 100644 --- a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -6,10 +6,14 @@ use core::arch::nvptx; pub struct PTXAllocator; unsafe impl GlobalAlloc for PTXAllocator { + #[allow(clippy::inline_always)] + #[inline(always)] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { nvptx::malloc(layout.size()).cast() } + #[allow(clippy::inline_always)] + #[inline(always)] unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { nvptx::free(ptr.cast()); } diff --git a/src/device/utils.rs b/src/device/utils.rs index e12f5b83c..bac1c6d3b 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -14,6 +14,12 @@ pub fn abort() -> ! 
{ #[allow(clippy::inline_always)] #[inline(always)] pub fn print(args: ::core::fmt::Arguments) { + #[repr(C)] + struct FormatArgs { + msg_len: u32, + msg_ptr: *const u8, + } + let msg; // place to store the dynamically expanded format string let msg = if let Some(msg) = args.as_str() { msg @@ -23,6 +29,14 @@ pub fn print(args: ::core::fmt::Arguments) { }; unsafe { - ::core::arch::nvptx::vprintf(msg.as_ptr(), ::core::ptr::null_mut()); + ::core::arch::nvptx::vprintf( + c"%.*s".as_ptr().cast(), + #[allow(clippy::cast_possible_truncation)] + ::core::ptr::from_ref(&FormatArgs { + msg_len: msg.len() as u32, + msg_ptr: msg.as_ptr(), + }) + .cast(), + ); } } diff --git a/src/lib.rs b/src/lib.rs index 273e27779..0316613c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,9 @@ #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] +#![feature(ptr_from_ref)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] From eb1a9b4bf6c20c6778d134951ee0b4d1f42ac1a5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 01:45:07 -0800 Subject: [PATCH 045/120] Make print example fully executable --- examples/print/src/main.rs | 92 +++++++++++++++++++++++++++++++++++--- src/device/utils.rs | 2 +- 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 4d38e0ede..94302a7a9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -11,12 +11,52 @@ extern crate alloc; #[cfg(not(target_os = "cuda"))] -fn main() {} +fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { + // Initialize the CUDA API + rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + + // Get the first device + let device = rust_cuda::rustacuda::device::Device::get_device(0)?; + + // Create a context 
associated to this device + let context = rust_cuda::host::CudaDropWrapper::from( + rust_cuda::rustacuda::context::Context::create_and_push( + rust_cuda::rustacuda::context::ContextFlags::MAP_HOST + | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, + device, + )?, + ); + + rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( + rust_cuda::rustacuda::context::ResourceLimit::StackSize, + 4096, + )?; + rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( + rust_cuda::rustacuda::context::ResourceLimit::PrintfFifoSize, + 4096, + )?; + + let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( + rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, + None, + )?); + + let mut kernel = host::Launcher::try_new( + rust_cuda::rustacuda::function::GridSize::x(1), + rust_cuda::rustacuda::function::BlockSize::x(4), + )?; + + kernel.kernel(&stream)?; + + std::mem::drop(context); + + Ok(()) +} -#[rust_cuda::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel() { - rust_cuda::device::utils::print(format_args!("println! 
from CUDA kernel")); + rust_cuda::device::utils::print(format_args!("print from CUDA kernel\n")); ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); } @@ -27,8 +67,28 @@ mod host { use super::KernelArgs; use super::{Kernel, KernelPtx}; - #[allow(dead_code)] - struct Launcher; + pub struct Launcher { + kernel: rust_cuda::host::TypedKernel, + grid: rust_cuda::rustacuda::function::GridSize, + block: rust_cuda::rustacuda::function::BlockSize, + watcher: (), + } + + impl Launcher { + pub fn try_new( + grid: rust_cuda::rustacuda::function::GridSize, + block: rust_cuda::rustacuda::function::BlockSize, + ) -> rust_cuda::rustacuda::error::CudaResult { + let kernel = Self::new_kernel()?; + + Ok(Self { + kernel, + grid, + block, + watcher: (), + }) + } + } link_kernel!(); @@ -37,7 +97,18 @@ mod host { type KernelTraitObject = dyn Kernel; fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { - unimplemented!() + rust_cuda::host::LaunchPackage { + config: rust_cuda::host::LaunchConfig { + grid: self.grid.clone(), + block: self.block.clone(), + shared_memory_size: 0_u32, + ptx_jit: false, + }, + + kernel: &mut self.kernel, + + watcher: &mut self.watcher, + } } } } @@ -63,23 +134,30 @@ mod cuda_prelude { struct FormatArgs { size: usize, align: usize, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, file_len: u32, file_ptr: *const u8, line: u32, column: u32, } + let thread_idx = rust_cuda::device::thread::Thread::this().idx(); let location = ::core::panic::Location::caller(); unsafe { ::core::arch::nvptx::vprintf( - c"memory allocation of %llu bytes with alignment %llu failed at %.*s:%lu:%lu\n" + c"memory allocation of %llu bytes with alignment %llu failed on thread (x=%u, y=%u, z=%u) at %*s:%u:%u\n" .as_ptr() .cast(), #[allow(clippy::cast_possible_truncation)] ::core::ptr::from_ref(&FormatArgs { size: layout.size(), align: layout.align(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, 
file_len: location.file().len() as u32, file_ptr: location.file().as_ptr(), line: location.line(), diff --git a/src/device/utils.rs b/src/device/utils.rs index bac1c6d3b..8e644be48 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -30,7 +30,7 @@ pub fn print(args: ::core::fmt::Arguments) { unsafe { ::core::arch::nvptx::vprintf( - c"%.*s".as_ptr().cast(), + c"%*s".as_ptr().cast(), #[allow(clippy::cast_possible_truncation)] ::core::ptr::from_ref(&FormatArgs { msg_len: msg.len() as u32, From b0303d6bbbee5a5845f45bb65525dbc4cad00535 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 13:31:49 +0000 Subject: [PATCH 046/120] Clean up the print example --- examples/print/.cargo/config.toml | 3 - examples/print/src/main.rs | 167 +++++++--------------- examples/single-source/.cargo/config.toml | 3 - examples/single-source/src/main.rs | 2 +- src/device/thread.rs | 22 +++ src/device/utils.rs | 159 ++++++++++++++++++-- src/host.rs | 24 +++- src/lib.rs | 3 + 8 files changed, 248 insertions(+), 135 deletions(-) diff --git a/examples/print/.cargo/config.toml b/examples/print/.cargo/config.toml index f7029e166..4a98afe58 100644 --- a/examples/print/.cargo/config.toml +++ b/examples/print/.cargo/config.toml @@ -1,5 +1,2 @@ [target.nvptx64-nvidia-cuda] rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] - -[unstable] -features = ["all"] diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 94302a7a9..c74380bf2 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -4,22 +4,45 @@ #![cfg_attr(target_os = "cuda", feature(abi_ptx))] #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] -#![feature(ptr_from_ref)] -#![feature(stdsimd)] -#![feature(c_str_literals)] +#![feature(const_type_name)] extern crate alloc; +#[derive(rust_cuda::const_type_layout::TypeLayout)] 
+#[layout(crate = "rust_cuda::const_type_layout")] +#[repr(C)] +pub enum Action { + Print, + Panic, + AllocError, +} + +#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] +#[kernel(allow(ptx::local_memory_usage))] +pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { + match action { + Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), + Action::Panic => panic!("panic! from CUDA kernel"), + Action::AllocError => { + ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()) + }, + } +} + #[cfg(not(target_os = "cuda"))] fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { + // Link the non-generic CUDA kernel + type Launcher = rust_cuda::host::SimpleKernelLauncher; + link_kernel!(); + // Initialize the CUDA API rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; - // Get the first device + // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; - // Create a context associated to this device - let context = rust_cuda::host::CudaDropWrapper::from( + // Create a CUDA context associated to this device + let _context = rust_cuda::host::CudaDropWrapper::from( rust_cuda::rustacuda::context::Context::create_and_push( rust_cuda::rustacuda::context::ContextFlags::MAP_HOST | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, @@ -36,83 +59,34 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { 4096, )?; + // Create a new CUDA stream to submit kernels to let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, )?); - let mut kernel = host::Launcher::try_new( - rust_cuda::rustacuda::function::GridSize::x(1), - rust_cuda::rustacuda::function::BlockSize::x(4), - )?; - - kernel.kernel(&stream)?; - - std::mem::drop(context); + // Create a new launcher for the CUDA kernel + let mut launcher = Launcher { + kernel: 
::new_kernel()?, + config: rust_cuda::host::LaunchConfig { + grid: rust_cuda::rustacuda::function::GridSize::x(1), + block: rust_cuda::rustacuda::function::BlockSize::x(4), + shared_memory_size: 0, + ptx_jit: false, + }, + }; + + // Launch the CUDA kernel on the stream and synchronise to its completion + println!("Launching print kernel ..."); + launcher.kernel(&stream, Action::Print).unwrap(); + println!("Launching panic kernel ..."); + launcher.kernel(&stream, Action::Panic).unwrap_err(); + println!("Launching alloc error kernel ..."); + launcher.kernel(&stream, Action::AllocError).unwrap_err(); Ok(()) } -#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] -#[kernel(allow(ptx::local_memory_usage))] -pub fn kernel() { - rust_cuda::device::utils::print(format_args!("print from CUDA kernel\n")); - - ::alloc::alloc::handle_alloc_error(::core::alloc::Layout::new::()); -} - -#[cfg(not(target_os = "cuda"))] -mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; - - pub struct Launcher { - kernel: rust_cuda::host::TypedKernel, - grid: rust_cuda::rustacuda::function::GridSize, - block: rust_cuda::rustacuda::function::BlockSize, - watcher: (), - } - - impl Launcher { - pub fn try_new( - grid: rust_cuda::rustacuda::function::GridSize, - block: rust_cuda::rustacuda::function::BlockSize, - ) -> rust_cuda::rustacuda::error::CudaResult { - let kernel = Self::new_kernel()?; - - Ok(Self { - kernel, - grid, - block, - watcher: (), - }) - } - } - - link_kernel!(); - - impl rust_cuda::host::Launcher for Launcher { - type CompilationWatcher = (); - type KernelTraitObject = dyn Kernel; - - fn get_launch_package(&mut self) -> rust_cuda::host::LaunchPackage { - rust_cuda::host::LaunchPackage { - config: rust_cuda::host::LaunchConfig { - grid: self.grid.clone(), - block: self.block.clone(), - shared_memory_size: 0_u32, - ptx_jit: false, - }, - - kernel: &mut self.kernel, - - watcher: &mut self.watcher, - } - } - } -} - 
#[cfg(target_os = "cuda")] mod cuda_prelude { use rust_cuda::device::alloc::PTXAllocator; @@ -122,51 +96,14 @@ mod cuda_prelude { #[panic_handler] fn panic(info: &::core::panic::PanicInfo) -> ! { - rust_cuda::device::utils::print(format_args!("{info}\n")); - - rust_cuda::device::utils::abort() + // pretty format and print the panic message + // but don't allow dynamic formatting or panic payload downcasting + rust_cuda::device::utils::pretty_panic_handler(info, false, false) } #[alloc_error_handler] #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! { - #[repr(C)] - struct FormatArgs { - size: usize, - align: usize, - thread_idx_x: u32, - thread_idx_y: u32, - thread_idx_z: u32, - file_len: u32, - file_ptr: *const u8, - line: u32, - column: u32, - } - - let thread_idx = rust_cuda::device::thread::Thread::this().idx(); - let location = ::core::panic::Location::caller(); - - unsafe { - ::core::arch::nvptx::vprintf( - c"memory allocation of %llu bytes with alignment %llu failed on thread (x=%u, y=%u, z=%u) at %*s:%u:%u\n" - .as_ptr() - .cast(), - #[allow(clippy::cast_possible_truncation)] - ::core::ptr::from_ref(&FormatArgs { - size: layout.size(), - align: layout.align(), - thread_idx_x: thread_idx.x, - thread_idx_y: thread_idx.y, - thread_idx_z: thread_idx.z, - file_len: location.file().len() as u32, - file_ptr: location.file().as_ptr(), - line: location.line(), - column: location.column(), - }) - .cast(), - ); - } - - rust_cuda::device::utils::abort() + rust_cuda::device::utils::pretty_alloc_error_handler(layout) } } diff --git a/examples/single-source/.cargo/config.toml b/examples/single-source/.cargo/config.toml index f7029e166..4a98afe58 100644 --- a/examples/single-source/.cargo/config.toml +++ b/examples/single-source/.cargo/config.toml @@ -1,5 +1,2 @@ [target.nvptx64-nvidia-cuda] rustflags = ["-Clink-args=--arch sm_35", "-Clinker-plugin-lto", "-Ccodegen-units=1", "-Clink-arg=-O3", "-Clink-arg=--lto"] - -[unstable] -features = ["all"] 
diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 085bd3b8d..97e0be020 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -91,7 +91,7 @@ mod host { link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); impl rc::host::Launcher for Launcher { - type CompilationWatcher = (); + type CompilationWatcher<'a> = (); type KernelTraitObject = dyn Kernel; fn get_launch_package(&mut self) -> rc::host::LaunchPackage { diff --git a/src/device/thread.rs b/src/device/thread.rs index 8f3bc5719..26ee357d2 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -18,11 +18,15 @@ pub struct ThreadBlockGrid { impl Thread { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn this() -> Self { Self { _private: () } } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn index(&self) -> usize { let block = self.block(); let grid = block.grid(); @@ -34,6 +38,8 @@ impl Thread { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn idx(&self) -> Idx3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -46,6 +52,8 @@ impl Thread { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn block(&self) -> ThreadBlock { ThreadBlock { _private: () } } @@ -53,6 +61,8 @@ impl Thread { impl ThreadBlock { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn dim(&self) -> Dim3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -65,6 +75,8 @@ impl ThreadBlock { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn idx(&self) -> Idx3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -77,10 +89,14 @@ impl ThreadBlock { } #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn grid(&self) -> ThreadBlockGrid { ThreadBlockGrid { _private: () } } + #[allow(clippy::inline_always)] + #[inline(always)] pub fn synchronize(&self) { unsafe { nvptx::_syncthreads() } } @@ -88,6 +104,8 @@ impl 
ThreadBlock { impl ThreadBlockGrid { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn dim(&self) -> Dim3 { #[allow(clippy::cast_sign_loss)] unsafe { @@ -118,6 +136,8 @@ pub struct Idx3 { impl Dim3 { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn size(&self) -> usize { (self.x as usize) * (self.y as usize) * (self.z as usize) } @@ -125,6 +145,8 @@ impl Dim3 { impl Idx3 { #[must_use] + #[allow(clippy::inline_always)] + #[inline(always)] pub fn as_id(&self, dim: &Dim3) -> usize { (self.x as usize) + (self.y as usize) * (dim.x as usize) diff --git a/src/device/utils.rs b/src/device/utils.rs index 8e644be48..e7206e118 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -5,9 +5,30 @@ pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } -/// The [`print`](print()) function takes an [`Arguments`](core::fmt::Arguments) -/// struct and formats and prints it to the CUDA kernel's standard output using -/// the `vprintf` system call. +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::print!`] macro, which now forwards to the +/// [`print()`] function. +pub macro print($($arg:tt)*) { + self::print(::core::format_args!($($arg)*)) +} + +/// Prints to the CUDA kernel's standard output using the `vprintf` system call. +/// +/// Replacement for the [`std::println!`] macro, which now forwards to the +/// [`print()`] function. +pub macro println { + () => { + self::print(::core::format_args!("\n")) + }, + ($($arg:tt)*) => { + self::print(::core::format_args!("{}\n", ::core::format_args!($($arg)*))) + }, +} + +/// The [`print()`] function takes an [`Arguments`](core::fmt::Arguments) struct +/// and formats and prints it to the CUDA kernel's standard output using the +/// `vprintf` system call. /// /// The [`Arguments`](core::fmt::Arguments) instance can be created with the /// [`format_args!`](core::format_args) macro. 
@@ -28,15 +49,133 @@ pub fn print(args: ::core::fmt::Arguments) { msg.as_str() }; + let args = FormatArgs { + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), + }; + + unsafe { + ::core::arch::nvptx::vprintf(c"%*s".as_ptr().cast(), ::core::ptr::from_ref(&args).cast()); + } +} + +// TODO: docs +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_panic_handler( + info: &::core::panic::PanicInfo, + allow_dynamic_message: bool, + allow_dynamic_payload: bool, +) -> ! { + #[repr(C)] + struct FormatArgs { + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, + msg_len: u32, + msg_ptr: *const u8, + } + + let msg; // place to store the dynamically expanded format string + let msg = if let Some(message) = info.message() { + if let Some(msg) = message.as_str() { + msg + } else if allow_dynamic_message { + msg = ::alloc::fmt::format(*message); + msg.as_str() + } else { + "" + } + } else if let Some(msg) = info.payload().downcast_ref::<&'static str>() + && allow_dynamic_payload + { + msg + } else if let Some(msg) = info.payload().downcast_ref::<::alloc::string::String>() + && allow_dynamic_payload + { + msg.as_str() + } else { + "" + }; + + let location_line = info.location().map_or(0, ::core::panic::Location::line); + let location_column = info.location().map_or(0, ::core::panic::Location::column); + let location_file = info + .location() + .map_or("", ::core::panic::Location::file); + + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + file_len: u32::try_from(location_file.len()).unwrap_or(u32::MAX), + file_ptr: location_file.as_ptr(), + line: location_line, + column: location_column, + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + msg_len: u32::try_from(msg.len()).unwrap_or(u32::MAX), + msg_ptr: msg.as_ptr(), + }; + unsafe { ::core::arch::nvptx::vprintf( - 
c"%*s".as_ptr().cast(), - #[allow(clippy::cast_possible_truncation)] - ::core::ptr::from_ref(&FormatArgs { - msg_len: msg.len() as u32, - msg_ptr: msg.as_ptr(), - }) - .cast(), + c"panicked at %*s:%u:%u on thread (x=%u, y=%u, z=%u):\n%*s\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), ); } + + abort() +} + +// TODO: docs +#[track_caller] +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { + #[repr(C)] + struct FormatArgs { + size: usize, + align: usize, + file_len: u32, + file_ptr: *const u8, + line: u32, + column: u32, + thread_idx_x: u32, + thread_idx_y: u32, + thread_idx_z: u32, + } + + let location = ::core::panic::Location::caller(); + let thread_idx = crate::device::thread::Thread::this().idx(); + + let args = FormatArgs { + size: layout.size(), + align: layout.align(), + file_len: u32::try_from(location.file().len()).unwrap_or(u32::MAX), + file_ptr: location.file().as_ptr(), + line: location.line(), + column: location.column(), + thread_idx_x: thread_idx.x, + thread_idx_y: thread_idx.y, + thread_idx_z: thread_idx.z, + }; + + unsafe { + ::core::arch::nvptx::vprintf( + c"memory allocation of %llu bytes with alignment %llu failed at \ + %*s:%u:%u on thread (x=%u, y=%u, z=%u)\n" + .as_ptr() + .cast(), + ::core::ptr::from_ref(&args).cast(), + ); + } + + abort() } diff --git a/src/host.rs b/src/host.rs index 9709798d3..1bdce5ee6 100644 --- a/src/host.rs +++ b/src/host.rs @@ -29,7 +29,7 @@ use crate::{ pub trait Launcher { type KernelTraitObject: ?Sized; - type CompilationWatcher; + type CompilationWatcher<'a>; fn get_launch_package(&mut self) -> LaunchPackage; @@ -38,7 +38,7 @@ pub trait Launcher { /// Should only return a [`CudaError`] if some implementation-defined /// critical kernel function configuration failed. 
#[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: &mut Self::CompilationWatcher) -> CudaResult<()> { + fn on_compile(kernel: &Function, watcher: Self::CompilationWatcher<'_>) -> CudaResult<()> { Ok(()) } } @@ -54,7 +54,25 @@ pub struct LaunchConfig { pub struct LaunchPackage<'l, L: ?Sized + Launcher> { pub config: LaunchConfig, pub kernel: &'l mut TypedKernel, - pub watcher: &'l mut L::CompilationWatcher, + pub watcher: L::CompilationWatcher<'l>, +} + +pub struct SimpleKernelLauncher { + pub kernel: TypedKernel, + pub config: LaunchConfig, +} + +impl Launcher for SimpleKernelLauncher { + type CompilationWatcher<'a> = (); + type KernelTraitObject = KernelTraitObject; + + fn get_launch_package(&mut self) -> LaunchPackage { + LaunchPackage { + config: self.config.clone(), + kernel: &mut self.kernel, + watcher: (), + } + } } pub enum KernelJITResult<'k> { diff --git a/src/lib.rs b/src/lib.rs index 0316613c9..5ac5f8218 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,9 @@ #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] #![feature(ptr_from_ref)] +#![feature(decl_macro)] +#![feature(panic_info_message)] +#![feature(let_chains)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![allow(incomplete_features)] From df09a966393be7720853d1ab0b00567702143b59 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 14:20:54 +0000 Subject: [PATCH 047/120] ptr_from_ref is stable from 1.76 --- rust-cuda-ptx-jit/src/lib.rs | 3 ++- src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index 1f22b2830..802e26cdd 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -1,6 +1,7 @@ #![deny(clippy::pedantic)] #![cfg_attr(not(feature = "host"), no_std)] -#![feature(ptr_from_ref)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![feature(doc_cfg)] 
#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/lib.rs b/src/lib.rs index 5ac5f8218..100e95325 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,12 +23,12 @@ #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] -#![feature(ptr_from_ref)] #![feature(decl_macro)] #![feature(panic_info_message)] #![feature(let_chains)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] #![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] From a49dd175698010d3ec985526c0b28c67deed50db Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 16 Dec 2023 15:34:15 +0000 Subject: [PATCH 048/120] Exit on CUDA panic instead of abort to allow the host to handle the error --- examples/print/src/main.rs | 15 +++------------ src/device/utils.rs | 10 ++++++++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index c74380bf2..68c16ab03 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -50,15 +50,6 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { )?, ); - rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( - rust_cuda::rustacuda::context::ResourceLimit::StackSize, - 4096, - )?; - rust_cuda::rustacuda::context::CurrentContext::set_resource_limit( - rust_cuda::rustacuda::context::ResourceLimit::PrintfFifoSize, - 4096, - )?; - // Create a new CUDA stream to submit kernels to let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, @@ -78,11 +69,11 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - launcher.kernel(&stream, 
Action::Print).unwrap(); + launcher.kernel(&stream, Action::Print)?; println!("Launching panic kernel ..."); - launcher.kernel(&stream, Action::Panic).unwrap_err(); + launcher.kernel(&stream, Action::Panic)?; println!("Launching alloc error kernel ..."); - launcher.kernel(&stream, Action::AllocError).unwrap_err(); + launcher.kernel(&stream, Action::AllocError)?; Ok(()) } diff --git a/src/device/utils.rs b/src/device/utils.rs index e7206e118..073e7bd54 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -5,6 +5,12 @@ pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } +#[allow(clippy::inline_always)] +#[inline(always)] +pub fn exit() -> ! { + unsafe { ::core::arch::asm!("exit;", options(noreturn)) } +} + /// Prints to the CUDA kernel's standard output using the `vprintf` system call. /// /// Replacement for the [`std::print!`] macro, which now forwards to the @@ -131,7 +137,7 @@ pub fn pretty_panic_handler( ); } - abort() + exit() } // TODO: docs @@ -177,5 +183,5 @@ pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ ); } - abort() + exit() } From 1e4de0cb478fd5afd01ab59e301b68bdddc601e4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 19 Dec 2023 06:55:49 +0000 Subject: [PATCH 049/120] Backup of early progress for switching from kernel traits to functions --- examples/print/src/main.rs | 39 ++-- examples/single-source/src/main.rs | 36 ++-- rust-cuda-derive/src/kernel/link/mod.rs | 34 ++- .../src/kernel/specialise/call.rs | 18 ++ rust-cuda-derive/src/kernel/wrapper/config.rs | 4 - .../{get_ptx_str.rs => get_ptx.rs} | 12 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 75 ++++--- .../generate/cpu_linker_macro/new_kernel.rs | 34 --- .../generate/cpu_wrapper/kernel_func.rs | 65 ++++-- .../cpu_wrapper/kernel_func_async/mod.rs | 1 + .../wrapper/generate/cpu_wrapper/mod.rs | 52 ++--- .../src/kernel/wrapper/inputs/mod.rs | 68 +++--- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 +- rust-cuda-ptx-jit/src/host/compiler/mod.rs | 41 ---- rust-cuda-ptx-jit/src/host/kernel.rs | 58 ------ rust-cuda-ptx-jit/src/host/mod.rs | 43 +++- .../src/host/{compiler => }/preprocess.rs | 0 .../src/host/{compiler => }/regex.rs | 0 .../src/host/{compiler => }/replace.rs | 0 rust-cuda-ptx-jit/src/lib.rs | 2 +- src/host.rs | 194 ++++++++++++------ src/lib.rs | 1 + src/safety/kernel_signature.rs | 46 ++++- 23 files changed, 469 insertions(+), 368 deletions(-) rename rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/{get_ptx_str.rs => get_ptx.rs} (95%) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs delete mode 100644 rust-cuda-ptx-jit/src/host/compiler/mod.rs delete mode 100644 rust-cuda-ptx-jit/src/host/kernel.rs rename rust-cuda-ptx-jit/src/host/{compiler => }/preprocess.rs (100%) rename rust-cuda-ptx-jit/src/host/{compiler => }/regex.rs (100%) rename rust-cuda-ptx-jit/src/host/{compiler => }/replace.rs (100%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 68c16ab03..cedaa6bae 100644 --- 
a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -5,6 +5,9 @@ #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![feature(type_alias_impl_trait)] extern crate alloc; @@ -17,7 +20,7 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(pub use link_kernel! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(pub use link! as impl Kernel for Launcher)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { match action { @@ -32,11 +35,11 @@ pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { #[cfg(not(target_os = "cuda"))] fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Link the non-generic CUDA kernel - type Launcher = rust_cuda::host::SimpleKernelLauncher; - link_kernel!(); + struct KernelPtx; + link! 
{ impl kernel for KernelPtx } // Initialize the CUDA API - rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + /*rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; @@ -54,26 +57,28 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, - )?); + )?);*/ - // Create a new launcher for the CUDA kernel - let mut launcher = Launcher { - kernel: ::new_kernel()?, - config: rust_cuda::host::LaunchConfig { - grid: rust_cuda::rustacuda::function::GridSize::x(1), - block: rust_cuda::rustacuda::function::BlockSize::x(4), - shared_memory_size: 0, - ptx_jit: false, - }, + // Create a new instance of the CUDA kernel and prepare the launch config + let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); + let config = rust_cuda::host::LaunchConfig { + grid: rust_cuda::rustacuda::function::GridSize::x(1), + block: rust_cuda::rustacuda::function::BlockSize::x(4), + shared_memory_size: 0, + ptx_jit: false, }; + // let mut launcher = rust_cuda::host::Launcher { kernel: &mut typed_kernel, config }; // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - launcher.kernel(&stream, Action::Print)?; + kernel.launch1(&config, Action::Print)?; + // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - launcher.kernel(&stream, Action::Panic)?; + kernel.launch1(&config, Action::Panic)?; + // kernel(&mut launcher, Action::Panic)?; println!("Launching alloc error kernel ..."); - launcher.kernel(&stream, Action::AllocError)?; + kernel.launch1(&config, Action::AllocError)?; + // kernel(&mut launcher, Action::AllocError)?; Ok(()) } diff --git a/examples/single-source/src/main.rs 
b/examples/single-source/src/main.rs index 97e0be020..b06f7031c 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -6,6 +6,10 @@ #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(offset_of)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![feature(type_alias_impl_trait)] +#![feature(associated_type_bounds)] extern crate alloc; @@ -42,13 +46,13 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(use link_kernel! as impl Kernel for Launcher)] +#[rc::common::kernel(use link! as impl Kernel for Launcher)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), forbid(ptx::local_memory_usage, ptx::register_spills) )] -pub fn kernel<'a, T: rc::common::RustToCuda>( +pub fn kernel<'a, T: rc::common::RustToCuda + rc::safety::StackOnly + rc::safety::NoSafeAliasing>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -57,11 +61,7 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, #[kernel(pass = SafeDeviceCopy)] q: Triple, // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, -) where - T: rc::safety::StackOnly + rc::safety::NoSafeAliasing, - ::CudaRepresentation: rc::safety::StackOnly, - ::CudaAllocation: rc::common::EmptyCudaAlloc, -{ +) { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -80,24 +80,12 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[cfg(not(target_os = "cuda"))] mod host { - #[allow(unused_imports)] - use super::KernelArgs; - use super::{Kernel, KernelPtx}; + use super::{kernel, KernelArgs}; - #[allow(dead_code)] - struct 
Launcher(core::marker::PhantomData); - - link_kernel!(crate::Empty); - link_kernel!(rc::utils::device_copy::SafeDeviceCopyWrapper); - - impl rc::host::Launcher for Launcher { - type CompilationWatcher<'a> = (); - type KernelTraitObject = dyn Kernel; - - fn get_launch_package(&mut self) -> rc::host::LaunchPackage { - unimplemented!() - } - } + // Link several instances of the generic CUDA kernel + struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); + link! { impl kernel<'a, crate::Empty> for KernelPtx } + link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 78a352780..bcbe297cf 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, env, - ffi::CString, + ffi::{CStr, CString}, fmt::Write as FmtWrite, fs, io::{Read, Write}, @@ -70,7 +70,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { #[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { - const PTX_STR: &'static str = "ERROR in this PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; }); let LinkKernelConfig { @@ -95,7 +95,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { if skip_kernel_compilation() { return quote! { - const PTX_STR: &'static str = "CLIPPY skips specialised PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; } .into(); } @@ -107,7 +107,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Specialisation::Link(&specialisation), ) else { return (quote! 
{ - const PTX_STR: &'static str = "ERROR in this PTX compilation"; + const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; }) .into(); }; @@ -122,7 +122,23 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { &ptx_lint_levels, ); - (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() + let mut kernel_ptx = kernel_ptx.into_bytes(); + kernel_ptx.push(b'\0'); + + if let Err(err) = CStr::from_bytes_with_nul(&kernel_ptx) { + abort_call_site!( + "Kernel compilation generated invalid PTX: internal nul byte: {:?}", + err + ); + } + + // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 + let kernel_ptx = syn::LitByteStr::new(&kernel_ptx, proc_macro2::Span::call_site()); + // Safety: the validity of kernel_ptx as a CStr was just checked above + let kernel_ptx = + quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; + + (quote! { const PTX_CSTR: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }).into() } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { @@ -626,7 +642,7 @@ fn compile_kernel( Some(kernel_ptx) }, Err(err) => { - eprintln!("{err:?}"); + eprintln!("{err}"); emit_ptx_build_error(); None }, @@ -669,7 +685,7 @@ fn build_kernel_with_specialisation( let any_output = AtomicBool::new(false); let crate_name = String::from(builder.get_crate_name()); - match builder.build_live( + let build = builder.build_live( |stdout_line| { if let Ok(cargo_metadata::Message::CompilerMessage(mut message)) = serde_json::from_str(stdout_line) @@ -737,7 +753,9 @@ fn build_kernel_with_specialisation( ); colored::control::unset_override(); }, - )? 
{ + )?; + + match build { BuildStatus::Success(output) => { let ptx_path = output.get_assembly_path(); diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-derive/src/kernel/specialise/call.rs index 34eb0dc35..10e43d26a 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-derive/src/kernel/specialise/call.rs @@ -1,3 +1,5 @@ +use std::ffi::CStr; + use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] @@ -25,6 +27,22 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; + let mut mangled_kernel_ident = mangled_kernel_ident.into_bytes(); + mangled_kernel_ident.push(b'\0'); + + if let Err(err) = CStr::from_bytes_with_nul(&mangled_kernel_ident) { + abort_call_site!( + "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", + err + ); + } + + // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 + let mangled_kernel_ident = + syn::LitByteStr::new(&mangled_kernel_ident, proc_macro2::Span::call_site()); + // Safety: the validity of mangled_kernel_ident as a CStr was just checked above + let mangled_kernel_ident = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#mangled_kernel_ident) } }; + (quote! 
{ #mangled_kernel_ident }).into() } diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index d8951230d..6ba9ebedc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -3,7 +3,6 @@ pub(super) struct KernelConfig { pub(super) linker: syn::Ident, pub(super) kernel: syn::Ident, pub(super) args: syn::Ident, - pub(super) ptx: syn::Ident, pub(super) launcher: syn::Ident, } @@ -18,8 +17,6 @@ impl syn::parse::Parse for KernelConfig { let kernel: syn::Ident = input.parse()?; let _lt_token: syn::token::Lt = input.parse()?; let args: syn::Ident = input.parse()?; - let _comma: syn::token::Comma = input.parse()?; - let ptx: syn::Ident = input.parse()?; let _comma: Option = input.parse()?; let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; @@ -30,7 +27,6 @@ impl syn::parse::Parse for KernelConfig { linker, kernel, args, - ptx, launcher, }) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs similarity index 95% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 10732a133..790b3b8df 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -6,7 +6,7 @@ use crate::kernel::utils::skip_kernel_compilation; use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; #[allow(clippy::too_many_arguments)] -pub(super) fn quote_get_ptx_str( +pub(super) fn quote_get_ptx( crate_path: &syn::Path, FuncIdent { func_ident, @@ -43,12 +43,12 @@ pub(super) fn quote_get_ptx_str( #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match }> = 
#crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( - PTX_STR.as_bytes(), - concat!(".visible .entry ", #crate_path::host::specialise_kernel_call!( + PTX_CSTR.to_bytes(), + #crate_path::host::specialise_kernel_call!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token - )).as_bytes() + ).to_bytes(), ) }>; } @@ -78,7 +78,7 @@ pub(super) fn quote_get_ptx_str( }; quote! { - fn get_ptx_str() -> &'static str { + fn get_ptx() -> &'static ::core::ffi::CStr { #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* @@ -100,7 +100,7 @@ pub(super) fn quote_get_ptx_str( )* } } - PTX_STR + PTX_CSTR } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 0ca963bb2..495b61870 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -2,33 +2,35 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; -mod get_ptx_str; -mod new_kernel; +mod get_ptx; -use get_ptx_str::quote_get_ptx_str; -use new_kernel::quote_new_kernel; +use get_ptx::quote_get_ptx; pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { visibility, linker, + kernel, launcher, - ptx, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, generic_trait_params: generic_params, generic_close_token, + generic_kernel_params, .. }: &DeclGenerics, func_inputs: &FunctionInputs, - func_ident: &FuncIdent, + func_ident @ FuncIdent { + func_ident: func_ident_name, + func_ident_hash, .. 
+ }: &FuncIdent, func_params: &[syn::Ident], ptx_lint_levels: &TokenStream, ) -> TokenStream { - let macro_types = generic_params + let macro_generics = generic_kernel_params//generic_params .iter() .enumerate() .map(|(i, generic)| { @@ -37,50 +39,77 @@ pub(in super::super) fn quote_cpu_linker_macro( match generic { syn::GenericParam::Type(_) => quote!($#generic_ident:ty), syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => unreachable!(), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime),//unreachable!(), } }) .collect::>(); - let macro_type_ids = (0..generic_params.len()) + let macro_generic_ids = (0..generic_kernel_params.len()) .map(|i| quote::format_ident!("__g_{}", i)) .collect::>(); + let macro_only_lt_generic_ids = generic_kernel_params//generic_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => None, + syn::GenericParam::Lifetime(_) => Some(generic_ident), + } + }) + .collect::>(); + + let macro_non_lt_generic_ids = generic_kernel_params//generic_params + .iter() + .enumerate() + .filter_map(|(i, generic)| { + let generic_ident = quote::format_ident!("__g_{}", i); + + match generic { + syn::GenericParam::Type(_) | syn::GenericParam::Const(_) => Some(generic_ident), + syn::GenericParam::Lifetime(_) => None, + } + }) + .collect::>(); + let cpu_linker_macro_visibility = if visibility.is_some() { quote! { #[macro_export] } } else { quote! {} }; - let get_ptx_str = quote_get_ptx_str( + let get_ptx = quote_get_ptx( crate_path, func_ident, config, decl_generics, func_inputs, func_params, - ¯o_type_ids, + ¯o_non_lt_generic_ids, ptx_lint_levels, ); - let new_kernel = quote_new_kernel( - crate_path, - config, - decl_generics, - func_ident, - ¯o_type_ids, - ); quote! { #[cfg(not(target_os = "cuda"))] #cpu_linker_macro_visibility macro_rules! 
#linker { - (#(#macro_types),* $(,)?) => { - unsafe impl #ptx #generic_start_token #($#macro_type_ids),* #generic_close_token - for #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token + (impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident) => { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + //dyn #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token // #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token { - #get_ptx_str + #get_ptx - #new_kernel + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::host::specialise_kernel_call!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) + } } }; } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs deleted file mode 100644 index 6b53954e4..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/new_kernel.rs +++ /dev/null @@ -1,34 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, KernelConfig}; - -pub(super) fn quote_new_kernel( - crate_path: &syn::Path, - KernelConfig { kernel, .. }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_close_token, - .. - }: &DeclGenerics, - FuncIdent { - func_ident_hash, .. - }: &FuncIdent, - macro_type_ids: &[syn::Ident], -) -> TokenStream { - quote! 
{ - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > { - let ptx = Self::get_ptx_str(); - let entry_point = #crate_path::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_type_ids),* - #generic_close_token - ); - - #crate_path::host::TypedKernel::new(ptx, entry_point) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 94b4b9598..6fa778eb3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -5,11 +5,13 @@ use super::super::super::{ }; #[allow(clippy::too_many_arguments)] +#[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { kernel, args, .. }: &KernelConfig, + KernelConfig { kernel, args, visibility, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { + generic_kernel_params, generic_wrapper_params, generic_wrapper_where_clause, .. @@ -45,7 +47,7 @@ pub(super) fn quote_kernel_func_inputs( }, }; - let kernel_func_inputs = func_inputs + let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .enumerate() .map(|(i, arg)| match arg { @@ -56,44 +58,73 @@ pub(super) fn quote_kernel_func_inputs( ty, }) => { let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { + let syn_type: syn::Type = syn::parse_quote! { <() as #args #ty_generics>::#type_ident }; - - if let syn::Type::Reference(syn::TypeReference { + let syn_type = if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { - quote! 
{ - #(#attrs)* #pat #colon_token #and_token #lifetime #mutability #syn_type - } + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: lifetime.clone(), + mutability: *mutability, + elem: Box::new(syn_type), + }) } else { - quote! { #(#attrs)* #pat #colon_token #syn_type } - } + syn_type + }; + + let param = quote! { + #(#attrs)* #pat #colon_token #syn_type + }; + + (param, syn_type) }, syn::FnArg::Receiver(_) => unreachable!(), }) - .collect::>(); + .unzip(); let raw_func_input_wrap = generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); + let full_generics = generic_kernel_params.iter().map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }).collect::>(); + + let ty_turbofish = ty_generics.as_turbofish(); + quote! { + #[cfg(not(target_os = "cuda"))] + #[allow(non_camel_case_types)] + #visibility type #func_ident <#generic_kernel_params> = impl Copy + Fn( + &mut #crate_path::host::Launcher<#func_ident <#(#full_generics),*>>, + #(#kernel_func_input_tys),* + ) -> #crate_path::rustacuda::error::CudaResult<()>; + + #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::needless_lifetimes)] #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - fn #func_ident <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, + #visibility fn #func_ident ( + // &mut self, + // TODO: move the stream + // stream: &'stream #crate_path::rustacuda::stream::Stream, + // kernel: &mut #crate_path::host::TypedKernel<#func_ident #ty_generics>, + launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause + // TODO: don't allow 
where clause + //#generic_wrapper_where_clause { + let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; + // impls check adapted from Nikolai Vazquez's `impls` crate: // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 const fn __check_is_sync(_x: &T) -> bool { @@ -110,7 +141,9 @@ pub(super) fn quote_kernel_func_inputs( >::SYNC } - #raw_func_input_wrap + todo!() + + // #raw_func_input_wrap } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 747f4a278..63d0d472f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -61,6 +61,7 @@ pub(super) fn quote_kernel_func_async( generate_launch_types(crate_path, config, impl_generics, func_inputs); quote! { + #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::extra_unused_type_parameters)] #[allow(clippy::too_many_arguments)] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index 1b984f920..7007abe87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -14,10 +14,7 @@ use kernel_func_async::quote_kernel_func_async; pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, config @ KernelConfig { - visibility, - kernel, - ptx, - .. + visibility, kernel, .. }: &KernelConfig, decl @ DeclGenerics { generic_start_token, @@ -37,12 +34,6 @@ pub(in super::super) fn quote_cpu_wrapper( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! 
{ - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - let kernel_func = quote_kernel_func_inputs( crate_path, config, @@ -65,32 +56,27 @@ pub(in super::super) fn quote_cpu_wrapper( ); quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #ptx #generic_start_token #generic_trait_params #generic_close_token - #generic_trait_where_clause - { - fn get_ptx_str() -> &'static str where #launcher_predicate; + // #[cfg(not(target_os = "cuda"))] + // #[allow(clippy::missing_safety_doc)] + // #visibility unsafe trait #kernel #generic_start_token + // #generic_trait_params + // #generic_close_token: #crate_path::host::CompiledKernelPtx< + // dyn #kernel #ty_generics + // > #generic_trait_where_clause + // { + // #kernel_func - fn new_kernel() -> #crate_path::rustacuda::error::CudaResult< - #crate_path::host::TypedKernel - > where #launcher_predicate; - } + // #kernel_func_async + // } - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #kernel #generic_start_token #generic_trait_params #generic_close_token: #ptx #ty_generics - #generic_trait_where_clause - { - #kernel_func + // #[cfg(not(target_os = "cuda"))] + // #[allow(clippy::missing_safety_doc)] + // unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty + // #blanket_where_clause + // {} - #kernel_func_async - } + #kernel_func - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty - #blanket_where_clause - {} + // #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index f3cc1a4d8..4a25bf958 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -136,26 +136,26 @@ fn ensure_reference_type_lifetime( mutability, 
elem, }) => { - let lifetime = lifetime.clone().unwrap_or_else(|| { - let lifetime = syn::Lifetime::new( - &format!("'__r2c_lt_{implicit_lifetime_id}"), - lifetime.span(), - ); - - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: lifetime.clone(), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - - lifetime - }); + // let lifetime = lifetime.clone().unwrap_or_else(|| { + // let lifetime = syn::Lifetime::new( + // &format!("'__r2c_lt_{implicit_lifetime_id}"), + // lifetime.span(), + // ); + + // generic_params.insert( + // *implicit_lifetime_id, + // syn::GenericParam::Lifetime(syn::LifetimeDef { + // attrs: Vec::new(), + // lifetime: lifetime.clone(), + // colon_token: None, + // bounds: syn::punctuated::Punctuated::new(), + // }), + // ); + + // *implicit_lifetime_id += 1; + + // lifetime + // }); let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { (|| { @@ -203,25 +203,25 @@ fn ensure_reference_type_lifetime( Box::new(syn::Type::Reference(syn::TypeReference { and_token: *and_token, - lifetime: Some(lifetime), + lifetime: lifetime.clone(),//Some(lifetime), mutability: *mutability, elem, })) }, ty => { - if matches!(cuda_type, InputCudaType::LendRustToCuda) { - generic_params.insert( - *implicit_lifetime_id, - syn::GenericParam::Lifetime(syn::LifetimeDef { - attrs: Vec::new(), - lifetime: r2c_move_lifetime(i, ty), - colon_token: None, - bounds: syn::punctuated::Punctuated::new(), - }), - ); - - *implicit_lifetime_id += 1; - } + // if matches!(cuda_type, InputCudaType::LendRustToCuda) { + // generic_params.insert( + // *implicit_lifetime_id, + // syn::GenericParam::Lifetime(syn::LifetimeDef { + // attrs: Vec::new(), + // lifetime: r2c_move_lifetime(i, ty), + // colon_token: None, + // bounds: syn::punctuated::Punctuated::new(), + // }), + // ); + + // *implicit_lifetime_id += 1; + // } Box::new(ty.clone()) }, 
diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a812f9dd4..a4db5f7f3 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -36,8 +36,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects \ - LINKER, KERNEL, ARGS, PTX, and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ + KERNEL, ARGS, and LAUNCHER identifiers: {:?}", err ) }, @@ -211,12 +211,18 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); let blanket_ty = syn::Ident::new("K", Span::mixed_site()); let mut blanket_params = generic_trait_params.clone(); - let ptx = &config.ptx; blanket_params.push(syn::GenericParam::Type(syn::TypeParam { attrs: Vec::new(), ident: blanket_ty.clone(), colon_token: syn::parse_quote!(:), - bounds: syn::parse_quote!(#ptx #ty_generics), + bounds: { + let kernel = &config.kernel; + syn::parse_quote! 
{ + #crate_path::host::CompiledKernelPtx< + dyn #kernel #ty_generics + > + } + }, eq_token: None, default: None, })); diff --git a/rust-cuda-ptx-jit/src/host/compiler/mod.rs b/rust-cuda-ptx-jit/src/host/compiler/mod.rs deleted file mode 100644 index 156e8223c..000000000 --- a/rust-cuda-ptx-jit/src/host/compiler/mod.rs +++ /dev/null @@ -1,41 +0,0 @@ -use std::ffi::{CStr, CString}; - -mod preprocess; -mod regex; -mod replace; - -type ByteSliceOptionalArguments = Option>]>>; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct PtxJITCompiler { - ptx_slices: Box<[PtxElement]>, - last_arguments: ByteSliceOptionalArguments, - last_ptx: CString, -} - -#[doc(cfg(feature = "host"))] -pub enum PtxJITResult<'s> { - Cached(&'s CStr), - Recomputed(&'s CStr), -} - -enum PtxLoadWidth { - B1, - B2, - B4, - B8, -} - -enum PtxElement { - CopiedSource { - ptx: Box<[u8]>, - }, - ConstLoad { - ptx: Box<[u8]>, - parameter_index: usize, - byte_offset: usize, - load_width: PtxLoadWidth, - registers: Box<[Box<[u8]>]>, - }, -} diff --git a/rust-cuda-ptx-jit/src/host/kernel.rs b/rust-cuda-ptx-jit/src/host/kernel.rs deleted file mode 100644 index 02baabfcf..000000000 --- a/rust-cuda-ptx-jit/src/host/kernel.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::{ffi::CStr, mem::ManuallyDrop}; - -use rustacuda::{error::CudaResult, function::Function, module::Module}; - -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] -pub struct CudaKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl CudaKernel { - /// # Errors - /// - /// Returns a `CudaError` if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. 
- pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for CudaKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/rust-cuda-ptx-jit/src/host/mod.rs index 2ace3405d..156e8223c 100644 --- a/rust-cuda-ptx-jit/src/host/mod.rs +++ b/rust-cuda-ptx-jit/src/host/mod.rs @@ -1,2 +1,41 @@ -pub mod compiler; -pub mod kernel; +use std::ffi::{CStr, CString}; + +mod preprocess; +mod regex; +mod replace; + +type ByteSliceOptionalArguments = Option>]>>; + +#[doc(cfg(feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub struct PtxJITCompiler { + ptx_slices: Box<[PtxElement]>, + last_arguments: ByteSliceOptionalArguments, + last_ptx: CString, +} + +#[doc(cfg(feature = "host"))] +pub enum PtxJITResult<'s> { + Cached(&'s CStr), + Recomputed(&'s CStr), +} + +enum PtxLoadWidth { + B1, + B2, + B4, + B8, +} + +enum PtxElement { + CopiedSource { + ptx: Box<[u8]>, + }, + ConstLoad { + ptx: Box<[u8]>, + parameter_index: usize, + byte_offset: usize, + load_width: PtxLoadWidth, + registers: Box<[Box<[u8]>]>, + }, +} diff --git 
a/rust-cuda-ptx-jit/src/host/compiler/preprocess.rs b/rust-cuda-ptx-jit/src/host/preprocess.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/preprocess.rs rename to rust-cuda-ptx-jit/src/host/preprocess.rs diff --git a/rust-cuda-ptx-jit/src/host/compiler/regex.rs b/rust-cuda-ptx-jit/src/host/regex.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/regex.rs rename to rust-cuda-ptx-jit/src/host/regex.rs diff --git a/rust-cuda-ptx-jit/src/host/compiler/replace.rs b/rust-cuda-ptx-jit/src/host/replace.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/compiler/replace.rs rename to rust-cuda-ptx-jit/src/host/replace.rs diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs index 802e26cdd..8b25fc9a0 100644 --- a/rust-cuda-ptx-jit/src/lib.rs +++ b/rust-cuda-ptx-jit/src/lib.rs @@ -9,7 +9,7 @@ mod host; #[cfg(feature = "host")] -pub use host::{compiler::PtxJITCompiler, compiler::PtxJITResult, kernel::CudaKernel}; +pub use host::{PtxJITCompiler, PtxJITResult}; #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] diff --git a/src/host.rs b/src/host.rs index 1bdce5ee6..ea2bd11a8 100644 --- a/src/host.rs +++ b/src/host.rs @@ -1,4 +1,5 @@ -use core::{ +use std::{ + ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, @@ -23,27 +24,28 @@ use crate::{ common::{ DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - ptx_jit::{CudaKernel, PtxJITCompiler, PtxJITResult}, + ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; -pub trait Launcher { - type KernelTraitObject: ?Sized; - type CompilationWatcher<'a>; +pub struct Launcher<'a, Kernel> { + pub kernel: &'a mut TypedPtxKernel, + pub config: LaunchConfig, +} - fn get_launch_package(&mut self) -> LaunchPackage; +impl<'a, Kernel> Launcher<'a, Kernel> { + #[allow(clippy::missing_errors_doc)] + pub fn launch0(&mut self) -> CudaResult<()> where Kernel: 
Copy + FnOnce(&mut Launcher) -> CudaResult<()> { + self.kernel.launch0(&self.config) + } - /// # Errors - /// - /// Should only return a [`CudaError`] if some implementation-defined - /// critical kernel function configuration failed. - #[allow(unused_variables)] - fn on_compile(kernel: &Function, watcher: Self::CompilationWatcher<'_>) -> CudaResult<()> { - Ok(()) + #[allow(clippy::missing_errors_doc)] + pub fn launch1(&mut self, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { + self.kernel.launch1(&self.config, arg1) } } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct LaunchConfig { pub grid: rustacuda::function::GridSize, pub block: rustacuda::function::BlockSize, @@ -51,26 +53,57 @@ pub struct LaunchConfig { pub ptx_jit: bool, } -pub struct LaunchPackage<'l, L: ?Sized + Launcher> { - pub config: LaunchConfig, - pub kernel: &'l mut TypedKernel, - pub watcher: L::CompilationWatcher<'l>, +#[doc(cfg(feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub struct PtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, } -pub struct SimpleKernelLauncher { - pub kernel: TypedKernel, - pub config: LaunchConfig, +impl PtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. 
+ pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } } -impl Launcher for SimpleKernelLauncher { - type CompilationWatcher<'a> = (); - type KernelTraitObject = KernelTraitObject; +impl Drop for PtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } - fn get_launch_package(&mut self) -> LaunchPackage { - LaunchPackage { - config: self.config.clone(), - kernel: &mut self.kernel, - watcher: (), + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); } } } @@ -80,63 +113,92 @@ pub enum KernelJITResult<'k> { Recompiled(&'k Function<'k>), } -pub struct TypedKernel { +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +pub struct TypedPtxKernel { compiler: PtxJITCompiler, - kernel: Option, - entry_point: alloc::boxed::Box, - marker: PhantomData, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, } -impl TypedKernel { - /// # Errors - /// - /// Returns a [`CudaError`] if `ptx` or `entry_point` contain nul bytes. 
- pub fn new(ptx: &str, entry_point: &str) -> CudaResult { - let ptx_cstring = std::ffi::CString::new(ptx).map_err(|_| CudaError::InvalidPtx)?; - - let compiler = crate::ptx_jit::PtxJITCompiler::new(&ptx_cstring); - - let entry_point_cstring = - std::ffi::CString::new(entry_point).map_err(|_| CudaError::InvalidValue)?; - let entry_point = entry_point_cstring.into_boxed_c_str(); +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = crate::ptx_jit::PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - Ok(Self { + Self { compiler, - kernel: None, + ptx_kernel: None, entry_point, - marker: PhantomData, - }) + configure, + marker: PhantomData::, + } } /// # Errors /// - /// Returns a [`CudaError`] if `ptx` (from [`Self::new`]) is not a valid - /// PTX source, or it does not contain an entry point named `entry_point` - /// (from [`Self::new`]). + /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to + /// [`Self::new`] is not a valid PTX source or does not contain the + /// entry point it declares. 
pub fn compile_with_ptx_jit_args( &mut self, arguments: Option<&[Option<*const [u8]>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); - let kernel_jit = match (&mut self.kernel, ptx_jit) { - (Some(kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(kernel.get_function()) + let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { + (Some(ptx_kernel), PtxJITResult::Cached(_)) => { + KernelJITResult::Cached(ptx_kernel.get_function()) }, - (kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_kernel = CudaKernel::new(ptx_cstr, &self.entry_point)?; + (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { + let recomputed_ptx_kernel = PtxKernel::new(ptx_cstr, &self.entry_point)?; // Replace the existing compiled kernel, drop the old one - let kernel = kernel.insert(recomputed_kernel); + let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); + + let function = ptx_kernel.get_function(); + + if let Some(configure) = self.configure.as_mut() { + configure(function)?; + } - KernelJITResult::Recompiled(kernel.get_function()) + KernelJITResult::Recompiled(function) }, }; Ok(kernel_jit) } + + #[allow(clippy::missing_errors_doc)] + pub fn launch0(&mut self, config: &LaunchConfig) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { + (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }) + } + + #[allow(clippy::missing_errors_doc)] + pub fn launch1(&mut self, config: &LaunchConfig, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { + (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }, arg1) + } +} + +const fn conjure() -> T { + union Transmute { + empty: (), + magic: T, + } + + assert!(std::mem::size_of::() == 0); + assert!(std::mem::align_of::() == 1); + + unsafe { Transmute { empty: () }.magic } } +struct Assert; +trait True {} +impl 
True for Assert {} + pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the @@ -908,3 +970,17 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea self.host_val } } + +/// # Safety +/// +/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond +/// to the compiled kernel code for the `Kernel` function and contain a kernel +/// entry point whose name is returned by +/// [`CompiledKernelPtx::get_entry_point`]. +/// +/// This trait should not be implemented manually – use the +/// [`kernel`](crate::common::kernel) macro instead. +pub unsafe trait CompiledKernelPtx { + fn get_ptx() -> &'static CStr; + fn get_entry_point() -> &'static CStr; +} diff --git a/src/lib.rs b/src/lib.rs index 100e95325..15e704e79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(decl_macro)] #![feature(panic_info_message)] #![feature(let_chains)] +#![feature(inline_const)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] diff --git a/src/safety/kernel_signature.rs b/src/safety/kernel_signature.rs index 4a82ec1d0..96bdd3f32 100644 --- a/src/safety/kernel_signature.rs +++ b/src/safety/kernel_signature.rs @@ -7,13 +7,33 @@ pub enum CpuAndGpuKernelSignatures { pub struct Assert; #[must_use] -pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures { - let mut i = 0; +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> CpuAndGpuKernelSignatures { + const KERNEL_TYPE: &[u8] = b".visible .entry "; + let mut j = 0; + while j < ptx.len() { + let Some(j2) = find(ptx, KERNEL_TYPE, j) else { + return CpuAndGpuKernelSignatures::Mismatch; + }; + + if starts_with(ptx, entry_point, j2) { + return CpuAndGpuKernelSignatures::Match; + } + + j += 1; + } + + CpuAndGpuKernelSignatures::Mismatch +} + +const fn find(haystack: &[u8], 
needle: &[u8], from: usize) -> Option { + let mut i = 0; + let mut j = from; + while i < needle.len() { if j >= haystack.len() { - return CpuAndGpuKernelSignatures::Mismatch; + return None; } if needle[i] == haystack[j] { @@ -25,5 +45,23 @@ pub const fn check(haystack: &[u8], needle: &[u8]) -> CpuAndGpuKernelSignatures } } - CpuAndGpuKernelSignatures::Match + Some(j) +} + +const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { + let mut i = 0; + + while i < needle.len() { + if (from + i) >= haystack.len() { + return false; + } + + if needle[i] == haystack[from + i] { + i += 1; + } else { + return false; + } + } + + true } From a3ec63a7611280b63f3c8eda90d92fc3f2f2964d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 20 Dec 2023 12:59:18 +0000 Subject: [PATCH 050/120] More work into kernel functions instead of traits --- examples/print/src/main.rs | 14 +- examples/single-source/src/main.rs | 18 ++- rust-cuda-derive/src/kernel/specialise/ty.rs | 62 +++++++-- rust-cuda-derive/src/kernel/utils.rs | 6 - rust-cuda-derive/src/kernel/wrapper/config.rs | 18 +-- .../src/kernel/wrapper/generate/args_trait.rs | 33 +---- .../generate/cpu_linker_macro/get_ptx.rs | 14 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 63 ++++----- .../generate/cpu_wrapper/kernel_func.rs | 82 ++++-------- .../kernel_func_async/async_func_types.rs | 17 ++- .../kernel_func_async/launch_types.rs | 17 ++- .../cpu_wrapper/kernel_func_async/mod.rs | 71 +++------- .../wrapper/generate/cpu_wrapper/mod.rs | 44 +----- .../wrapper/generate/cuda_generic_function.rs | 2 - .../kernel/wrapper/generate/cuda_wrapper.rs | 28 ++-- .../src/kernel/wrapper/inputs/mod.rs | 69 +--------- rust-cuda-derive/src/kernel/wrapper/mod.rs | 125 +++--------------- rust-cuda-derive/src/kernel/wrapper/parse.rs | 7 + rust-cuda-derive/src/lib.rs | 1 + src/host.rs | 46 +++++-- 20 files changed, 280 insertions(+), 457 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 
cedaa6bae..dc38b3fa9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -8,6 +8,7 @@ #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] +#![feature(decl_macro)] extern crate alloc; @@ -20,7 +21,7 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(pub use link! as impl Kernel for Launcher)] +#[rust_cuda::common::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { match action { @@ -39,7 +40,7 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { link! { impl kernel for KernelPtx } // Initialize the CUDA API - /*rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device let device = rust_cuda::rustacuda::device::Device::get_device(0)?; @@ -57,7 +58,7 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, None, - )?);*/ + )?); // Create a new instance of the CUDA kernel and prepare the launch config let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); @@ -67,17 +68,16 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { shared_memory_size: 0, ptx_jit: false, }; - // let mut launcher = rust_cuda::host::Launcher { kernel: &mut typed_kernel, config }; // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1(&config, Action::Print)?; + kernel.launch1(&stream, &config, Action::Print)?; // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - kernel.launch1(&config, Action::Panic)?; + kernel.launch1(&stream, &config, Action::Panic)?; // kernel(&mut launcher, Action::Panic)?; 
println!("Launching alloc error kernel ..."); - kernel.launch1(&config, Action::AllocError)?; + kernel.launch1(&stream, &config, Action::AllocError)?; // kernel(&mut launcher, Action::AllocError)?; Ok(()) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b06f7031c..796e6ee4f 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -10,6 +10,7 @@ #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] +#![feature(decl_macro)] extern crate alloc; @@ -46,13 +47,20 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(use link! as impl Kernel for Launcher)] +#[rc::common::kernel(pub use link! for impl)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), forbid(ptx::local_memory_usage, ptx::register_spills) )] -pub fn kernel<'a, T: rc::common::RustToCuda + rc::safety::StackOnly + rc::safety::NoSafeAliasing>( +pub fn kernel< + 'a, + T: rc::common::RustToCuda< + CudaRepresentation: rc::safety::StackOnly, + CudaAllocation: rc::common::EmptyCudaAlloc, + > + rc::safety::StackOnly + + rc::safety::NoSafeAliasing, +>( #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, @@ -80,12 +88,12 @@ pub fn kernel<'a, T: rc::common::RustToCuda(std::marker::PhantomData<&'a T>); - link! { impl kernel<'a, crate::Empty> for KernelPtx } - link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } + crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } + crate::link! 
{ impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 9b5a06955..196f2556a 100644 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -1,11 +1,16 @@ use proc_macro::TokenStream; +use quote::ToTokens; pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - let SpecialiseTypeConfig { kernel, typedef } = match syn::parse_macro_input::parse(tokens) { + let SpecialiseTypeConfig { + _private, // TODO: either use or remove the private path + args, + typedef, + } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(KERNEL::TYPEDEF) expects KERNEL and TYPEDEF identifiers: \ + "specialise_kernel_type!(ARGS::TYPEDEF) expects ARGS path and TYPEDEF identifier: \ {:?}", err ) @@ -20,15 +25,47 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - kernel.to_string().to_uppercase() + args.to_string().to_uppercase() ); match proc_macro::tracked_env::var(&specialisation_var) { Ok(specialisation) => { - match format!("<() as {kernel}{specialisation}>::{typedef}").parse() { - Ok(parsed_specialisation) => parsed_specialisation, + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - } + }; + + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: syn::parse_quote!(()), + position: 1, // 2, + as_token: syn::parse_quote!(as), + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: None, + segments: [ + 
// syn::PathSegment { + // ident: private, + // arguments: syn::PathArguments::None, + // }, + syn::PathSegment { + ident: args, + arguments: specialisation, + }, + syn::PathSegment { + ident: typedef, + arguments: syn::PathArguments::None, + }, + ] + .into_iter() + .collect(), + }, + }) + .into_token_stream() + .into() }, Err(err) => abort_call_site!( "Failed to read specialisation from {:?}: {:?}", @@ -39,16 +76,23 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { } struct SpecialiseTypeConfig { - kernel: syn::Ident, + _private: syn::Ident, + args: syn::Ident, typedef: syn::Ident, } impl syn::parse::Parse for SpecialiseTypeConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { - let kernel: syn::Ident = input.parse()?; + let private: syn::Ident = input.parse()?; + let _dc: syn::token::Colon2 = input.parse()?; + let args: syn::Ident = input.parse()?; let _dc: syn::token::Colon2 = input.parse()?; let typedef: syn::Ident = input.parse()?; - Ok(Self { kernel, typedef }) + Ok(Self { + _private: private, + args, + typedef, + }) } } diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-derive/src/kernel/utils.rs index 5afd05858..c73876f09 100644 --- a/rust-cuda-derive/src/kernel/utils.rs +++ b/rust-cuda-derive/src/kernel/utils.rs @@ -1,5 +1,3 @@ -use syn::spanned::Spanned; - pub fn skip_kernel_compilation() -> bool { let mut skip_compilation = false; @@ -13,7 +11,3 @@ pub fn skip_kernel_compilation() -> bool { skip_compilation } - -pub fn r2c_move_lifetime(arg: usize, ty: &syn::Type) -> syn::Lifetime { - syn::Lifetime::new(&format!("'__r2c_move_lt_{arg}"), ty.span()) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index 6ba9ebedc..cc9531acc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -1,9 +1,8 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, pub(super) linker: syn::Ident, - 
pub(super) kernel: syn::Ident, + pub(super) private: syn::Ident, pub(super) args: syn::Ident, - pub(super) launcher: syn::Ident, } impl syn::parse::Parse for KernelConfig { @@ -12,22 +11,17 @@ impl syn::parse::Parse for KernelConfig { let _use: syn::token::Use = input.parse()?; let linker: syn::Ident = input.parse()?; let _bang: syn::token::Bang = input.parse()?; - let _as: syn::token::As = input.parse()?; - let _impl: syn::token::Impl = input.parse()?; - let kernel: syn::Ident = input.parse()?; - let _lt_token: syn::token::Lt = input.parse()?; - let args: syn::Ident = input.parse()?; - let _comma: Option = input.parse()?; - let _gt_token: syn::token::Gt = input.parse()?; let _for: syn::token::For = input.parse()?; - let launcher: syn::Ident = input.parse()?; + let _impl: syn::token::Impl = input.parse()?; + + let private = syn::Ident::new("private", proc_macro::Span::def_site().into()); + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); Ok(Self { visibility, linker, - kernel, + private, args, - launcher, }) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs index 4c725601b..d45a35fb0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs @@ -1,22 +1,12 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::{FunctionInputs, ImplGenerics, KernelConfig}; pub(in super::super) fn quote_args_trait( - KernelConfig { - visibility, args, .. - }: &KernelConfig, - DeclGenerics { - generic_start_token, - generic_trait_params: generic_params, - generic_close_token, - generic_trait_where_clause: generic_where_clause, - .. - }: &DeclGenerics, + KernelConfig { args, .. }: &KernelConfig, ImplGenerics { impl_generics, ty_generics, - where_clause, }: &ImplGenerics, FunctionInputs { func_inputs, .. 
}: &FunctionInputs, ) -> TokenStream { @@ -52,25 +42,12 @@ pub(in super::super) fn quote_args_trait( .collect::>(); quote! { - #[cfg(not(target_os = "cuda"))] - #[allow(clippy::missing_safety_doc)] - #visibility unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { + #[allow(non_camel_case_types)] + pub trait #args #impl_generics { #(#func_input_typedefs)* } - // #args must always be pub in CUDA kernel as it is used to define the - // public kernel entry point signature - #[cfg(target_os = "cuda")] - #[allow(clippy::missing_safety_doc)] - pub unsafe trait #args #generic_start_token #generic_params #generic_close_token - #generic_where_clause - { - #(#func_input_typedefs)* - } - - unsafe impl #impl_generics #args #ty_generics for () #where_clause { + impl #impl_generics #args #ty_generics for () { #(#func_input_types)* } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 790b3b8df..75fc008ed 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,7 +3,9 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, InputCudaType, KernelConfig}; +use super::super::super::{ + DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, +}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -19,6 +21,7 @@ pub(super) fn quote_get_ptx( generic_close_token, .. 
}: &DeclGenerics, + impl_generics: &ImplGenerics, inputs: &FunctionInputs, func_params: &[syn::Ident], macro_type_ids: &[syn::Ident], @@ -32,6 +35,8 @@ pub(super) fn quote_get_ptx( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); + let args_trait = super::super::args_trait::quote_args_trait(config, impl_generics, inputs); + let cpu_func_lifetime_erased_types = generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); @@ -79,6 +84,9 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + #[allow(unused_imports)] + use __rust_cuda_ffi_safe_assert::#args; + #crate_path::host::link_kernel!{ #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* @@ -92,7 +100,9 @@ pub(super) fn quote_get_ptx( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { #[allow(unused_imports)] - use super::#args; + use super::*; + + #args_trait extern "C" { #( #[allow(dead_code)] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index 495b61870..ae2be49d9 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -1,36 +1,34 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, KernelConfig}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod get_ptx; use get_ptx::quote_get_ptx; +#[allow(clippy::too_many_arguments)] // FIXME pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, config @ KernelConfig { - visibility, - linker, - kernel, - launcher, - .. + visibility, linker, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, - generic_trait_params: generic_params, generic_close_token, generic_kernel_params, .. }: &DeclGenerics, + impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, func_ident @ FuncIdent { func_ident: func_ident_name, - func_ident_hash, .. + func_ident_hash, + .. }: &FuncIdent, func_params: &[syn::Ident], ptx_lint_levels: &TokenStream, ) -> TokenStream { - let macro_generics = generic_kernel_params//generic_params + let macro_generics = generic_kernel_params .iter() .enumerate() .map(|(i, generic)| { @@ -39,7 +37,7 @@ pub(in super::super) fn quote_cpu_linker_macro( match generic { syn::GenericParam::Type(_) => quote!($#generic_ident:ty), syn::GenericParam::Const(_) => quote!($#generic_ident:expr), - syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime),//unreachable!(), + syn::GenericParam::Lifetime(_) => quote!($#generic_ident:lifetime), } }) .collect::>(); @@ -48,7 +46,7 @@ pub(in super::super) fn quote_cpu_linker_macro( .map(|i| quote::format_ident!("__g_{}", i)) .collect::>(); - let macro_only_lt_generic_ids = generic_kernel_params//generic_params + let macro_only_lt_generic_ids = generic_kernel_params .iter() .enumerate() .filter_map(|(i, generic)| { @@ -61,7 +59,7 @@ pub(in super::super) fn quote_cpu_linker_macro( }) .collect::>(); - let macro_non_lt_generic_ids = generic_kernel_params//generic_params + let macro_non_lt_generic_ids = generic_kernel_params .iter() .enumerate() .filter_map(|(i, generic)| { @@ -74,17 +72,12 @@ pub(in super::super) fn quote_cpu_linker_macro( }) .collect::>(); - let cpu_linker_macro_visibility = if visibility.is_some() { - quote! { #[macro_export] } - } else { - quote! {} - }; - let get_ptx = quote_get_ptx( crate_path, func_ident, config, decl_generics, + impl_generics, func_inputs, func_params, ¯o_non_lt_generic_ids, @@ -93,25 +86,25 @@ pub(in super::super) fn quote_cpu_linker_macro( quote! 
{ #[cfg(not(target_os = "cuda"))] - #cpu_linker_macro_visibility - macro_rules! #linker { - (impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident) => { - unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< - #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token - //dyn #kernel #generic_start_token #($#macro_type_ids),* #generic_close_token - > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token // #launcher #generic_start_token #($#macro_type_ids),* #generic_close_token - { - #get_ptx + #visibility macro #linker( + impl #func_ident_name #generic_start_token + #(#macro_generics),* $(,)? + #generic_close_token for $ptx:ident + ) { + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token + > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token + { + #get_ptx - fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_call!( - #func_ident_hash #generic_start_token - #($#macro_non_lt_generic_ids),* - #generic_close_token - ) - } + fn get_entry_point() -> &'static ::core::ffi::CStr { + #crate_path::host::specialise_kernel_call!( + #func_ident_hash #generic_start_token + #($#macro_non_lt_generic_ids),* + #generic_close_token + ) } - }; + } } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 6fa778eb3..b0fa4625f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -8,12 +8,12 @@ use super::super::super::{ #[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { kernel, args, 
visibility, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, - generic_wrapper_params, - generic_wrapper_where_clause, + generic_start_token, + generic_close_token, .. }: &DeclGenerics, inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, @@ -21,32 +21,6 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .enumerate() @@ -59,7 +33,7 @@ pub(super) fn quote_kernel_func_inputs( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type: syn::Type = syn::parse_quote! { - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; let syn_type = if let syn::Type::Reference(syn::TypeReference { and_token, @@ -88,21 +62,31 @@ pub(super) fn quote_kernel_func_inputs( }) .unzip(); + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let raw_func_input_wrap = - generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params); + generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params, &launcher); + + let full_generics = generic_kernel_params + .iter() + .map(|param| match param { + syn::GenericParam::Type(syn::TypeParam { ident, .. }) + | syn::GenericParam::Const(syn::ConstParam { ident, .. 
}) => quote!(#ident), + syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), + }) + .collect::>(); - let full_generics = generic_kernel_params.iter().map(|param| match param { - syn::GenericParam::Type(syn::TypeParam { ident, .. }) | syn::GenericParam::Const(syn::ConstParam { ident, .. }) => quote!(#ident), - syn::GenericParam::Lifetime(syn::LifetimeDef { lifetime, .. }) => quote!(#lifetime), - }).collect::>(); - let ty_turbofish = ty_generics.as_turbofish(); quote! { #[cfg(not(target_os = "cuda"))] #[allow(non_camel_case_types)] - #visibility type #func_ident <#generic_kernel_params> = impl Copy + Fn( - &mut #crate_path::host::Launcher<#func_ident <#(#full_generics),*>>, + pub type #func_ident #generic_start_token + #generic_kernel_params + #generic_close_token = impl Copy + Fn( + &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, #(#kernel_func_input_tys),* ) -> #crate_path::rustacuda::error::CudaResult<()>; @@ -112,17 +96,10 @@ pub(super) fn quote_kernel_func_inputs( #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - #visibility fn #func_ident ( - // &mut self, - // TODO: move the stream - // stream: &'stream #crate_path::rustacuda::stream::Stream, - // kernel: &mut #crate_path::host::TypedKernel<#func_ident #ty_generics>, - launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, + pub fn #func_ident <#generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, #(#kernel_func_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - // TODO: don't allow where clause - //#generic_wrapper_where_clause - { + ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; // impls check adapted from Nikolai Vazquez's `impls` crate: @@ -141,9 +118,7 @@ pub(super) fn quote_kernel_func_inputs( >::SYNC } - 
todo!() - - // #raw_func_input_wrap + #raw_func_input_wrap } } } @@ -159,6 +134,7 @@ fn generate_raw_func_input_wrap( func_ident_async, .. }: &FuncIdent, func_params: &[syn::Ident], + launcher: &syn::Ident, ) -> TokenStream { func_inputs .iter() @@ -167,8 +143,8 @@ fn generate_raw_func_input_wrap( .rev() .fold( quote! { - self.#func_ident_async(stream, #(#func_params),*)?; - stream.synchronize() + #func_ident_async(#launcher, #(#func_params),*)?; + #launcher.stream.synchronize() }, |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index efe8026eb..652ff4bc6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -1,18 +1,17 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, + stream: &syn::Lifetime, ) -> Vec { func_inputs .iter() @@ -27,7 +26,7 @@ pub(super) fn generate_async_func_types( }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote! { - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; let cuda_type = match cuda_mode { @@ -47,6 +46,8 @@ pub(super) fn generate_async_func_types( .. 
}) = &**ty { + let lifetime = lifetime.clone().unwrap_or(syn::parse_quote!('_)); + let wrapped_type = if mutability.is_some() { if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { abort!( @@ -56,11 +57,11 @@ pub(super) fn generate_async_func_types( } quote!( - #crate_path::host::HostAndDeviceMutRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime, #cuda_type> ) } else { quote!( - #crate_path::host::HostAndDeviceConstRefAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime, #cuda_type> ) }; @@ -68,10 +69,8 @@ pub(super) fn generate_async_func_types( #(#attrs)* #mutability #pat #colon_token #wrapped_type } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - let wrapped_type = quote! { - #crate_path::host::HostAndDeviceOwnedAsync<'stream, #lifetime, #cuda_type> + #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_type> }; quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 454bdcd57..55771c3c8 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -1,13 +1,11 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { func_inputs, @@ -25,7 +23,7 @@ pub(in super::super) fn generate_launch_types( syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - <() as #args #ty_generics>::#type_ident + <() as #private :: #args #ty_generics>::#type_ident }; cpu_func_unboxed_types.push(syn_type.clone()); @@ -48,20 +46,21 @@ pub(in super::super) fn generate_launch_types( .. }) = &**ty { + let comma: Option = + lifetime.as_ref().map(|_| syn::parse_quote!(,)); + if mutability.is_some() { quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_type> } } else { quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> + #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_type> } } } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = r2c_move_lifetime(i, ty); - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> + #crate_path::common::DeviceMutRef<#cuda_type> } } else { quote! { #cuda_type } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index 63d0d472f..d4830d254 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -13,48 +13,26 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config @ KernelConfig { kernel, .. }: &KernelConfig, + config: &KernelConfig, impl_generics @ ImplGenerics { ty_generics, .. 
}: &ImplGenerics, DeclGenerics { - generic_wrapper_params, - generic_wrapper_where_clause, + generic_kernel_params, .. }: &DeclGenerics, func_inputs: &FunctionInputs, FuncIdent { - func_ident_async, .. + func_ident, + func_ident_async, + .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let launcher_predicate = quote! { - Self: Sized + #crate_path::host::Launcher< - KernelTraitObject = dyn #kernel #ty_generics - > - }; - - let generic_wrapper_where_clause = match generic_wrapper_where_clause { - Some(syn::WhereClause { - where_token, - predicates, - }) if !predicates.is_empty() => { - let comma = if predicates.empty_or_trailing() { - quote!() - } else { - quote!(,) - }; - - quote! { - #where_token #predicates #comma #launcher_predicate - } - }, - _ => quote! { - where #launcher_predicate - }, - }; + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); let kernel_func_async_inputs = - generate_async_func_types(crate_path, config, impl_generics, func_inputs); + generate_async_func_types(crate_path, config, impl_generics, func_inputs, &stream); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_unboxed_types) = @@ -67,31 +45,18 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] #[allow(unused_variables)] - fn #func_ident_async <'stream, #generic_wrapper_params>( - &mut self, - stream: &'stream #crate_path::rustacuda::stream::Stream, + pub fn #func_ident_async <#stream, #generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, #(#kernel_func_async_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> - #generic_wrapper_where_clause - { - let #crate_path::host::LaunchPackage { - kernel, watcher, 
config - } = #crate_path::host::Launcher::get_launch_package(self); - - let kernel_jit_result = if config.ptx_jit { - kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? + ) -> #crate_path::rustacuda::error::CudaResult<()> { + let kernel_jit_result = if #launcher.config.ptx_jit { + #launcher.kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? } else { - kernel.compile_with_ptx_jit_args(None)? + #launcher.kernel.compile_with_ptx_jit_args(None)? }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) => { - // Call launcher hook on kernel compilation - ::on_compile(function, watcher)?; - - function - }, - #crate_path::host::KernelJITResult::Cached(function) => function, + #crate_path::host::KernelJITResult::Recompiled(function) + | #crate_path::host::KernelJITResult::Cached(function) => function, }; #[allow(clippy::redundant_closure_call)] @@ -109,9 +74,9 @@ pub(super) fn quote_kernel_func_async( let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, - } = config; + } = #launcher.config.clone(); - unsafe { stream.launch(function, grid, block, shared_memory_size, + unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, &[ #( &#func_params as *const _ as *mut ::core::ffi::c_void diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index 7007abe87..ef99f68fc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -1,8 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{ - BlanketGenerics, DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig, -}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod kernel_func; mod kernel_func_async; @@ -13,22 +11,9 @@ use kernel_func_async::quote_kernel_func_async; 
#[allow(clippy::too_many_arguments)] pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, - config @ KernelConfig { - visibility, kernel, .. - }: &KernelConfig, - decl @ DeclGenerics { - generic_start_token, - generic_trait_params, - generic_close_token, - generic_trait_where_clause, - .. - }: &DeclGenerics, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, - BlanketGenerics { - blanket_ty, - impl_generics: blanket_impl_generics, - where_clause: blanket_where_clause, - }: &BlanketGenerics, + config: &KernelConfig, + decl: &DeclGenerics, + impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, fn_ident: &FuncIdent, func_params: &[syn::Ident], @@ -56,27 +41,8 @@ pub(in super::super) fn quote_cpu_wrapper( ); quote! { - // #[cfg(not(target_os = "cuda"))] - // #[allow(clippy::missing_safety_doc)] - // #visibility unsafe trait #kernel #generic_start_token - // #generic_trait_params - // #generic_close_token: #crate_path::host::CompiledKernelPtx< - // dyn #kernel #ty_generics - // > #generic_trait_where_clause - // { - // #kernel_func - - // #kernel_func_async - // } - - // #[cfg(not(target_os = "cuda"))] - // #[allow(clippy::missing_safety_doc)] - // unsafe impl #blanket_impl_generics #kernel #ty_generics for #blanket_ty - // #blanket_where_clause - // {} - #kernel_func - // #kernel_func_async + #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index 628642fc0..aa23b77c6 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -7,7 +7,6 @@ pub(in super::super) fn quote_cuda_generic_function( generic_start_token, generic_kernel_params: generic_params, generic_close_token, - generic_kernel_where_clause: generic_where_clause, .. 
}: &DeclGenerics, func_inputs: &syn::punctuated::Punctuated, @@ -19,7 +18,6 @@ pub(in super::super) fn quote_cuda_generic_function( #[cfg(target_os = "cuda")] #(#func_attrs)* fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) - #generic_where_clause #func_block } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 40d4abfbf..3e573d583 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -4,13 +4,13 @@ use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, InputCudaType, KernelConfig, + FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, }; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - config @ KernelConfig { args, .. }: &KernelConfig, + config @ KernelConfig { args, private, .. }: &KernelConfig, inputs @ FunctionInputs { func_inputs, func_input_cuda_types, @@ -20,6 +20,7 @@ pub(in super::super) fn quote_cuda_wrapper( func_ident_hash, .. }: &FuncIdent, + impl_generics: &ImplGenerics, func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { @@ -56,7 +57,7 @@ pub(in super::super) fn quote_cuda_wrapper( let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) }; match cuda_mode { @@ -99,12 +100,21 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); + let args_trait = super::args_trait::quote_args_trait(config, impl_generics, inputs); + quote! 
{ + // TODO: args trait should not be publicly available like this + // but specialisation requires it right now + #args_trait + #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_entry(#args)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + #[allow(unused_imports)] + use __rust_cuda_ffi_safe_assert::#args; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } @@ -123,7 +133,9 @@ pub(in super::super) fn quote_cuda_wrapper( #[deny(improper_ctypes)] mod __rust_cuda_ffi_safe_assert { #[allow(unused_imports)] - use super::#args; + use super::*; + + #args_trait extern "C" { #( #[allow(dead_code)] @@ -149,7 +161,7 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ptx_func_inputs( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, func_input_cuda_types, @@ -170,7 +182,7 @@ fn specialise_ptx_func_inputs( ) => { let type_ident = quote::format_ident!("__T_{}", i); let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) }; let cuda_type = match cuda_mode { @@ -228,7 +240,7 @@ fn specialise_ptx_func_inputs( fn specialise_ptx_unboxed_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, ) -> Vec { func_inputs @@ -239,7 +251,7 @@ fn specialise_ptx_unboxed_types( let type_ident = quote::format_ident!("__T_{}", i); quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index 4a25bf958..9222de237 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -1,7 +1,5 @@ use syn::spanned::Spanned; -use crate::kernel::utils::r2c_move_lifetime; - use super::{InputCudaType, InputPtxJit}; mod attribute; @@ -12,12 +10,7 @@ pub(super) struct FunctionInputs { pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, } -pub(super) fn parse_function_inputs( - func: &syn::ItemFn, - generic_params: &mut syn::punctuated::Punctuated, -) -> FunctionInputs { - let mut implicit_lifetime_id: usize = 0; - +pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { let (func_inputs, func_input_cuda_types): ( syn::punctuated::Punctuated, Vec<(InputCudaType, InputPtxJit)>, @@ -25,8 +18,7 @@ pub(super) fn parse_function_inputs( .sig .inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { + .map(|arg| match arg { receiver @ syn::FnArg::Receiver(_) => { abort!(receiver.span(), "Kernel function must not have a receiver.") }, @@ -94,13 +86,7 @@ pub(super) fn parse_function_inputs( ); }); - let ty = ensure_reference_type_lifetime( - i, - ty, - &cuda_type, - &mut implicit_lifetime_id, - generic_params, - ); + let ty = ensure_reference_type_lifetime(ty, &cuda_type); ( syn::FnArg::Typed(syn::PatType { @@ -122,13 +108,7 @@ pub(super) fn parse_function_inputs( } #[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime( - i: usize, - ty: &syn::Type, - cuda_type: &InputCudaType, - implicit_lifetime_id: &mut usize, - generic_params: &mut syn::punctuated::Punctuated, -) -> Box { +fn ensure_reference_type_lifetime(ty: &syn::Type, 
cuda_type: &InputCudaType) -> Box { match ty { syn::Type::Reference(syn::TypeReference { and_token, @@ -136,27 +116,6 @@ fn ensure_reference_type_lifetime( mutability, elem, }) => { - // let lifetime = lifetime.clone().unwrap_or_else(|| { - // let lifetime = syn::Lifetime::new( - // &format!("'__r2c_lt_{implicit_lifetime_id}"), - // lifetime.span(), - // ); - - // generic_params.insert( - // *implicit_lifetime_id, - // syn::GenericParam::Lifetime(syn::LifetimeDef { - // attrs: Vec::new(), - // lifetime: lifetime.clone(), - // colon_token: None, - // bounds: syn::punctuated::Punctuated::new(), - // }), - // ); - - // *implicit_lifetime_id += 1; - - // lifetime - // }); - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { (|| { if let syn::Type::Path(syn::TypePath { @@ -203,27 +162,11 @@ fn ensure_reference_type_lifetime( Box::new(syn::Type::Reference(syn::TypeReference { and_token: *and_token, - lifetime: lifetime.clone(),//Some(lifetime), + lifetime: lifetime.clone(), mutability: *mutability, elem, })) }, - ty => { - // if matches!(cuda_type, InputCudaType::LendRustToCuda) { - // generic_params.insert( - // *implicit_lifetime_id, - // syn::GenericParam::Lifetime(syn::LifetimeDef { - // attrs: Vec::new(), - // lifetime: r2c_move_lifetime(i, ty), - // colon_token: None, - // bounds: syn::punctuated::Punctuated::new(), - // }), - // ); - - // *implicit_lifetime_id += 1; - // } - - Box::new(ty.clone()) - }, + ty => Box::new(ty.clone()), } } diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index a4db5f7f3..3d42c9d8b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -132,116 +132,33 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } }; - let mut generic_kernel_params = func.sig.generics.params.clone(); - let mut func_inputs = parse_function_inputs(&func, &mut generic_kernel_params); - - let (generic_start_token, 
generic_close_token) = if generic_kernel_params.is_empty() { - (None, None) - } else if let (Some(start), Some(close)) = - (func.sig.generics.lt_token, func.sig.generics.gt_token) - { - (Some(start), Some(close)) - } else { - (Some(syn::parse_quote!(<)), Some(syn::parse_quote!(>))) - }; + let mut func_inputs = parse_function_inputs(&func); + + let generic_kernel_params = func.sig.generics.params.clone(); + let (generic_start_token, generic_close_token) = + (func.sig.generics.lt_token, func.sig.generics.gt_token); let generic_trait_params = generic_kernel_params .iter() .filter(|generic_param| !matches!(generic_param, syn::GenericParam::Lifetime(_))) .cloned() .collect(); - let generic_wrapper_params = generic_kernel_params - .iter() - .filter(|generic_param| matches!(generic_param, syn::GenericParam::Lifetime(_))) - .cloned() - .collect(); - - let generic_kernel_where_clause = &func.sig.generics.where_clause; - let generic_trait_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| !matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); - let generic_wrapper_where_clause = generic_kernel_where_clause.as_ref().map( - |syn::WhereClause { - where_token, - predicates, - }: &syn::WhereClause| { - let predicates = predicates - .iter() - .filter(|predicate| matches!(predicate, syn::WherePredicate::Lifetime(_))) - .cloned() - .collect(); - - syn::WhereClause { - where_token: *where_token, - predicates, - } - }, - ); let decl_generics = DeclGenerics { generic_start_token: &generic_start_token, - generic_trait_params: &generic_trait_params, generic_close_token: &generic_close_token, - generic_trait_where_clause: &generic_trait_where_clause, - generic_wrapper_params: &generic_wrapper_params, - generic_wrapper_where_clause: 
&generic_wrapper_where_clause, generic_kernel_params: &generic_kernel_params, - generic_kernel_where_clause, }; let trait_generics = syn::Generics { lt_token: generic_start_token, - params: generic_trait_params.clone(), + params: generic_trait_params, gt_token: generic_close_token, - where_clause: generic_trait_where_clause.clone(), - }; - let (impl_generics, ty_generics, where_clause) = trait_generics.split_for_impl(); - let blanket_ty = syn::Ident::new("K", Span::mixed_site()); - let mut blanket_params = generic_trait_params.clone(); - blanket_params.push(syn::GenericParam::Type(syn::TypeParam { - attrs: Vec::new(), - ident: blanket_ty.clone(), - colon_token: syn::parse_quote!(:), - bounds: { - let kernel = &config.kernel; - syn::parse_quote! { - #crate_path::host::CompiledKernelPtx< - dyn #kernel #ty_generics - > - } - }, - eq_token: None, - default: None, - })); - let trait_blanket_generics = syn::Generics { - lt_token: Some(generic_start_token.unwrap_or(syn::parse_quote!(<))), - params: blanket_params, - gt_token: Some(generic_close_token.unwrap_or(syn::parse_quote!(>))), - where_clause: generic_trait_where_clause.clone(), - }; - let (blanket_impl_generics, _, blanket_where_clause) = trait_blanket_generics.split_for_impl(); - let blanket_generics = BlanketGenerics { - blanket_ty, - impl_generics: blanket_impl_generics, - where_clause: blanket_where_clause, + where_clause: None, }; + let (impl_generics, ty_generics, _where_clause) = trait_generics.split_for_impl(); let impl_generics = ImplGenerics { impl_generics, ty_generics, - where_clause, }; let func_ident = FuncIdent { @@ -293,13 +210,12 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let args_trait = quote_args_trait(&config, &decl_generics, &impl_generics, &func_inputs); + let args_trait = quote_args_trait(&config, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( &crate_path, &config, &decl_generics, &impl_generics, - &blanket_generics, 
&func_inputs, &func_ident, &func_params, @@ -310,6 +226,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &crate_path, &config, &decl_generics, + &impl_generics, &func_inputs, &func_ident, &func_params, @@ -320,6 +237,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &config, &func_inputs, &func_ident, + &impl_generics, &func.attrs, &func_params, ); @@ -330,9 +248,16 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, &func.block, ); + let private = &config.private; (quote! { - #args_trait + mod #private { + #[allow(unused_imports)] + use super::*; + + #args_trait + } + #cpu_wrapper #cpu_cuda_check @@ -355,26 +280,14 @@ struct InputPtxJit(bool); #[allow(clippy::struct_field_names)] struct DeclGenerics<'f> { generic_start_token: &'f Option, - generic_trait_params: &'f syn::punctuated::Punctuated, generic_close_token: &'f Option, - generic_trait_where_clause: &'f Option, - generic_wrapper_params: &'f syn::punctuated::Punctuated, - generic_wrapper_where_clause: &'f Option, generic_kernel_params: &'f syn::punctuated::Punctuated, - generic_kernel_where_clause: &'f Option, } struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] impl_generics: syn::ImplGenerics<'f>, ty_generics: syn::TypeGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, -} - -struct BlanketGenerics<'f> { - blanket_ty: syn::Ident, - impl_generics: syn::ImplGenerics<'f>, - where_clause: Option<&'f syn::WhereClause>, } #[allow(clippy::struct_field_names)] diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 7d523adb0..56aa60053 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -50,5 +50,12 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ), }; + if let Some(r#where) = &func.sig.generics.where_clause { + abort!( + r#where.span(), + "Kernel function must not have a where 
clause, use type generic bounds instead." + ); + } + func } diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 74e76a2cc..1a0550bc5 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,7 @@ #![feature(if_let_guard)] #![feature(let_chains)] #![feature(map_try_insert)] +#![feature(proc_macro_def_site)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/host.rs b/src/host.rs index ea2bd11a8..5e01e5b1e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -28,20 +28,27 @@ use crate::{ safety::SafeDeviceCopy, }; -pub struct Launcher<'a, Kernel> { - pub kernel: &'a mut TypedPtxKernel, +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: &'stream Stream, + pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } -impl<'a, Kernel> Launcher<'a, Kernel> { +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { - self.kernel.launch0(&self.config) + pub fn launch0(&mut self) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, + { + self.kernel.launch0(self.stream, &self.config) } #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { - self.kernel.launch1(&self.config, arg1) + pub fn launch1(&mut self, arg1: A) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, + { + self.kernel.launch1(self.stream, &self.config, arg1) } } @@ -173,13 +180,30 @@ impl TypedPtxKernel { } #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self, config: &LaunchConfig) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()> { - (const { conjure::() })(&mut 
Launcher { kernel: self, config: config.clone() }) + pub fn launch0(&mut self, stream: &Stream, config: &LaunchConfig) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, + { + (const { conjure::() })(&mut Launcher { + stream, + kernel: self, + config: config.clone(), + }) } #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, config: &LaunchConfig, arg1: A) -> CudaResult<()> where Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()> { - (const { conjure::() })(&mut Launcher { kernel: self, config: config.clone() }, arg1) + pub fn launch1(&mut self, stream: &Stream, config: &LaunchConfig, arg1: A) -> CudaResult<()> + where + Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, + { + (const { conjure::() })( + &mut Launcher { + stream, + kernel: self, + config: config.clone(), + }, + arg1, + ) } } From 9af625374ede272a2090bf8ff48e21e72f9db3dd Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 04:59:58 +0000 Subject: [PATCH 051/120] Eliminate almost all ArgsTrait usages --- rust-cuda-derive/Cargo.toml | 2 +- rust-cuda-derive/src/kernel/link/config.rs | 9 +- rust-cuda-derive/src/kernel/link/mod.rs | 23 +- .../specialise/{call.rs => entry_point.rs} | 6 +- .../specialise/{entry.rs => function.rs} | 6 +- rust-cuda-derive/src/kernel/specialise/mod.rs | 4 +- rust-cuda-derive/src/kernel/specialise/ty.rs | 281 ++++++++++++++---- rust-cuda-derive/src/kernel/wrapper/config.rs | 12 +- .../{ => cpu_linker_macro}/args_trait.rs | 4 +- .../generate/cpu_linker_macro/get_ptx.rs | 16 +- .../wrapper/generate/cpu_linker_macro/mod.rs | 6 +- .../generate/cpu_wrapper/kernel_func.rs | 49 +-- .../kernel_func_async/async_func_types.rs | 13 +- .../kernel_func_async/launch_types.rs | 15 +- .../cpu_wrapper/kernel_func_async/mod.rs | 10 +- .../wrapper/generate/cpu_wrapper/mod.rs | 6 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 62 ++-- .../src/kernel/wrapper/generate/mod.rs | 1 - rust-cuda-derive/src/kernel/wrapper/mod.rs 
| 25 +- rust-cuda-derive/src/lib.rs | 8 +- src/device/mod.rs | 2 +- src/host.rs | 2 +- 22 files changed, 328 insertions(+), 234 deletions(-) rename rust-cuda-derive/src/kernel/specialise/{call.rs => entry_point.rs} (91%) rename rust-cuda-derive/src/kernel/specialise/{entry.rs => function.rs} (86%) rename rust-cuda-derive/src/kernel/wrapper/generate/{ => cpu_linker_macro}/args_trait.rs (92%) diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 31a686008..60677b1dd 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -12,7 +12,7 @@ links = "libnvptxcompiler_static" proc-macro = true [dependencies] -syn = { version = "1.0", features = ["full"] } +syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index efb7899fa..d7a4d0458 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -6,7 +6,6 @@ use super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, pub(super) kernel_hash: syn::Ident, - pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, pub(super) specialisation: String, @@ -17,7 +16,6 @@ impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; let kernel_hash: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -56,7 +54,6 @@ impl syn::parse::Parse for LinkKernelConfig { Ok(Self { kernel, kernel_hash, - args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), specialisation, @@ -67,22 +64,22 @@ impl syn::parse::Parse for LinkKernelConfig { #[allow(clippy::module_name_repetitions)] 
pub(super) struct CheckKernelConfig { + pub(super) kernel: syn::Ident, pub(super) kernel_hash: syn::Ident, - pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, } impl syn::parse::Parse for CheckKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { + let kernel: syn::Ident = input.parse()?; let kernel_hash: syn::Ident = input.parse()?; - let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; Ok(Self { + kernel, kernel_hash, - args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), }) diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index bcbe297cf..8424e7056 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -36,22 +36,22 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); let CheckKernelConfig { + kernel, kernel_hash, - args, crate_name, crate_path, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "check_kernel!(HASH ARGS NAME PATH) expects HASH and ARGS identifiers, annd NAME \ - and PATH string literals: {:?}", + "check_kernel!(KERNEL HASH NAME PATH) expects KERNEL and HASH identifiers, annd \ + NAME and PATH string literals: {:?}", err ) }, }; - let kernel_ptx = compile_kernel(&args, &crate_name, &crate_path, Specialisation::Check); + let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { return quote!(::core::result::Result::Err(())).into(); @@ -74,9 +74,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { }); let LinkKernelConfig { - kernel: _kernel, + kernel, kernel_hash, - args, crate_name, crate_path, specialisation, @@ -85,9 +84,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => 
config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL HASH ARGS NAME PATH SPECIALISATION LINTS,*) expects KERNEL, \ - HASH, and ARGS identifiers, NAME and PATH string literals, and SPECIALISATION \ - and LINTS tokens: {:?}", + "link_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ + HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \ + tokens: {:?}", err ) }, @@ -101,7 +100,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { } let Some(mut kernel_ptx) = compile_kernel( - &args, + &kernel, &crate_name, &crate_path, Specialisation::Link(&specialisation), @@ -601,7 +600,7 @@ fn check_kernel_ptx( } fn compile_kernel( - args: &syn::Ident, + kernel: &syn::Ident, crate_name: &str, crate_path: &Path, specialisation: Specialisation, @@ -618,7 +617,7 @@ fn compile_kernel( let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - args.to_string().to_uppercase() + kernel.to_string().to_uppercase() ); match build_kernel_with_specialisation(crate_path, &specialisation_var, specialisation) { diff --git a/rust-cuda-derive/src/kernel/specialise/call.rs b/rust-cuda-derive/src/kernel/specialise/entry_point.rs similarity index 91% rename from rust-cuda-derive/src/kernel/specialise/call.rs rename to rust-cuda-derive/src/kernel/specialise/entry_point.rs index 10e43d26a..5653a5539 100644 --- a/rust-cuda-derive/src/kernel/specialise/call.rs +++ b/rust-cuda-derive/src/kernel/specialise/entry_point.rs @@ -3,7 +3,7 @@ use std::ffi::CStr; use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { let SpecialiseMangleConfig { kernel, specialisation, @@ -11,8 +11,8 @@ pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_call!(KERNEL SPECIALISATION) 
expects KERNEL identifier and \ - SPECIALISATION tokens: {:?}", + "specialise_kernel_entry_point!(KERNEL SPECIALISATION) expects KERNEL identifier \ + and SPECIALISATION tokens: {:?}", err ) }, diff --git a/rust-cuda-derive/src/kernel/specialise/entry.rs b/rust-cuda-derive/src/kernel/specialise/function.rs similarity index 86% rename from rust-cuda-derive/src/kernel/specialise/entry.rs rename to rust-cuda-derive/src/kernel/specialise/function.rs index b85a433e7..068f30d97 100644 --- a/rust-cuda-derive/src/kernel/specialise/entry.rs +++ b/rust-cuda-derive/src/kernel/specialise/function.rs @@ -3,10 +3,10 @@ use std::env::VarError; use proc_macro::TokenStream; #[allow(clippy::module_name_repetitions)] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { let mut func: syn::ItemFn = syn::parse(func).unwrap_or_else(|err| { abort_call_site!( - "#[specialise_kernel_entry(...)] must be wrapped around a function: {:?}", + "#[specialise_kernel_function(...)] must be wrapped around a function: {:?}", err ) }); @@ -14,7 +14,7 @@ pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStr let kernel: syn::Ident = match syn::parse_macro_input::parse(attr) { Ok(kernel) => kernel, Err(err) => abort_call_site!( - "#[specialise_kernel_entry(KERNEL)] expects KERNEL identifier: {:?}", + "#[specialise_kernel_function(KERNEL)] expects KERNEL identifier: {:?}", err ), }; diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-derive/src/kernel/specialise/mod.rs index 337508b5b..6d30d4d5d 100644 --- a/rust-cuda-derive/src/kernel/specialise/mod.rs +++ b/rust-cuda-derive/src/kernel/specialise/mod.rs @@ -1,3 +1,3 @@ -pub mod call; -pub mod entry; +pub mod entry_point; +pub mod function; pub mod ty; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 196f2556a..9805abc3c 100644 
--- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -3,15 +3,15 @@ use quote::ToTokens; pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let SpecialiseTypeConfig { - _private, // TODO: either use or remove the private path - args, - typedef, + mut ty, + generics, + kernel, } = match syn::parse_macro_input::parse(tokens) { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(ARGS::TYPEDEF) expects ARGS path and TYPEDEF identifier: \ - {:?}", + "specialise_kernel_type!(TY for GENERICS in KERNEL) expects TY type, GENERICS \ + generics, and KERNEL identifier: {:?}", err ) }, @@ -25,74 +25,243 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { let specialisation_var = format!( "RUST_CUDA_DERIVE_SPECIALISE_{}_{}", crate_name, - args.to_string().to_uppercase() + kernel.to_string().to_uppercase() ); - match proc_macro::tracked_env::var(&specialisation_var) { - Ok(specialisation) => { - let specialisation = match syn::parse_str(&specialisation) { - _ if specialisation.is_empty() => syn::PathArguments::None, - Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), - Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), - }; - - syn::Type::Path(syn::TypePath { - qself: Some(syn::QSelf { - lt_token: syn::parse_quote!(<), - ty: syn::parse_quote!(()), - position: 1, // 2, - as_token: syn::parse_quote!(as), - gt_token: syn::parse_quote!(>), - }), - path: syn::Path { - leading_colon: None, - segments: [ - // syn::PathSegment { - // ident: private, - // arguments: syn::PathArguments::None, - // }, - syn::PathSegment { - ident: args, - arguments: specialisation, - }, - syn::PathSegment { - ident: typedef, - arguments: syn::PathArguments::None, - }, - ] - .into_iter() - .collect(), - }, - }) - .into_token_stream() - .into() - }, + let specialisation = match proc_macro::tracked_env::var(&specialisation_var) { + 
Ok(specialisation) => specialisation, Err(err) => abort_call_site!( "Failed to read specialisation from {:?}: {:?}", &specialisation_var, err ), + }; + let specialisation = match syn::parse_str(&specialisation) { + _ if specialisation.is_empty() => syn::PathArguments::None, + Ok(specialisation) => syn::PathArguments::AngleBracketed(specialisation), + Err(err) => abort_call_site!("Failed to parse specialisation: {:?}", err), + }; + + if let syn::PathArguments::AngleBracketed(syn::AngleBracketedGenericArguments { + args, .. + }) = specialisation + { + if generics.params.len() != args.len() { + abort_call_site!( + "Mismatch specialising {} with {}", + generics.split_for_impl().1.to_token_stream(), + args.to_token_stream() + ); + } + + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { + match (generic, arg) { + ( + syn::GenericParam::Lifetime(syn::LifetimeDef { + lifetime: generic, .. + }), + syn::GenericArgument::Lifetime(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldLifetimeGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Const(syn::ConstParam { ident: generic, .. }), + syn::GenericArgument::Const(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldConstGeneric { generic, arg }, ty); + }, + ( + syn::GenericParam::Type(syn::TypeParam { ident: generic, .. 
}), + syn::GenericArgument::Type(arg), + ) => { + ty = syn::fold::Fold::fold_type(&mut FoldTypeGeneric { generic, arg }, ty); + }, + (generic, arg) => abort_call_site!( + "Mismatch specialising {} with {}", + generic.to_token_stream(), + arg.to_token_stream() + ), + } + } + } else if !generics.params.is_empty() { + abort_call_site!( + "Missing specialisation for {}", + generics.split_for_impl().1.to_token_stream() + ); } + + ty.into_token_stream().into() } struct SpecialiseTypeConfig { - _private: syn::Ident, - args: syn::Ident, - typedef: syn::Ident, + ty: syn::Type, + generics: syn::Generics, + kernel: syn::Ident, } impl syn::parse::Parse for SpecialiseTypeConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { - let private: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let args: syn::Ident = input.parse()?; - let _dc: syn::token::Colon2 = input.parse()?; - let typedef: syn::Ident = input.parse()?; + let ty: syn::Type = input.parse()?; + let _for: syn::token::For = input.parse()?; + let generics: syn::Generics = input.parse()?; + let _in: syn::token::In = input.parse()?; + let kernel: syn::Ident = input.parse()?; Ok(Self { - _private: private, - args, - typedef, + ty, + generics, + kernel, }) } } + +struct FoldLifetimeGeneric { + generic: syn::Lifetime, + arg: syn::Lifetime, +} + +impl syn::fold::Fold for FoldLifetimeGeneric { + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { + if lt == self.generic { + self.arg.clone() + } else { + lt + } + } +} + +struct FoldConstGeneric { + generic: syn::Ident, + arg: syn::Expr, +} + +impl syn::fold::Fold for FoldConstGeneric { + fn fold_generic_argument(&mut self, arg: syn::GenericArgument) -> syn::GenericArgument { + let syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + })) = arg + else { + return syn::fold::fold_generic_argument(self, arg); + }; + + if let 
Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return syn::GenericArgument::Const(self.arg.clone()); + } + + syn::fold::fold_generic_argument( + self, + syn::GenericArgument::Type(syn::Type::Path(syn::TypePath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + })), + ) + } + + fn fold_expr(&mut self, expr: syn::Expr) -> syn::Expr { + let syn::Expr::Path(syn::ExprPath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + attrs, + }) = expr + else { + return syn::fold::fold_expr(self, expr); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && segments.len() == 1 + && ident == &self.generic + { + return self.arg.clone(); + } + + syn::fold::fold_expr( + self, + syn::Expr::Path(syn::ExprPath { + qself: None, + path: syn::Path { + leading_colon: None, + segments, + }, + attrs, + }), + ) + } +} + +struct FoldTypeGeneric { + generic: syn::Ident, + arg: syn::Type, +} + +impl syn::fold::Fold for FoldTypeGeneric { + fn fold_type(&mut self, ty: syn::Type) -> syn::Type { + let syn::Type::Path(syn::TypePath { + qself: None, + path: + syn::Path { + leading_colon: None, + segments, + }, + }) = ty + else { + return syn::fold::fold_type(self, ty); + }; + + if let Some(syn::PathSegment { + ident, + arguments: syn::PathArguments::None, + }) = segments.first() + && ident == &self.generic + { + return if segments.len() > 1 { + syn::Type::Path(syn::TypePath { + qself: Some(syn::QSelf { + lt_token: syn::parse_quote!(<), + ty: Box::new(self.arg.clone()), + position: 0, + as_token: None, + gt_token: syn::parse_quote!(>), + }), + path: syn::Path { + leading_colon: syn::parse_quote!(::), + segments: segments.into_iter().skip(1).collect(), + }, + }) + } else { + self.arg.clone() + }; + } + + syn::fold::fold_type( + self, + syn::Type::Path(syn::TypePath { + qself: None, 
+ path: syn::Path { + leading_colon: None, + segments, + }, + }), + ) + } +} diff --git a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-derive/src/kernel/wrapper/config.rs index cc9531acc..8f8cd2240 100644 --- a/rust-cuda-derive/src/kernel/wrapper/config.rs +++ b/rust-cuda-derive/src/kernel/wrapper/config.rs @@ -1,8 +1,6 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, pub(super) linker: syn::Ident, - pub(super) private: syn::Ident, - pub(super) args: syn::Ident, } impl syn::parse::Parse for KernelConfig { @@ -14,14 +12,6 @@ impl syn::parse::Parse for KernelConfig { let _for: syn::token::For = input.parse()?; let _impl: syn::token::Impl = input.parse()?; - let private = syn::Ident::new("private", proc_macro::Span::def_site().into()); - let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); - - Ok(Self { - visibility, - linker, - private, - args, - }) + Ok(Self { visibility, linker }) } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs similarity index 92% rename from rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs index d45a35fb0..178ed026d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs @@ -1,9 +1,9 @@ use proc_macro2::TokenStream; -use super::super::{FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::super::{FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_args_trait( - KernelConfig { args, .. 
}: &KernelConfig, + args: &syn::Ident, ImplGenerics { impl_generics, ty_generics, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 75fc008ed..7e4b88f87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,9 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -15,7 +13,6 @@ pub(super) fn quote_get_ptx( func_ident_hash, .. }: &FuncIdent, - config @ KernelConfig { args, .. }: &KernelConfig, generics @ DeclGenerics { generic_start_token, generic_close_token, @@ -35,10 +32,11 @@ pub(super) fn quote_get_ptx( let crate_manifest_dir = proc_macro::tracked_env::var("CARGO_MANIFEST_DIR") .unwrap_or_else(|err| abort_call_site!("Failed to read crate path: {:?}.", err)); - let args_trait = super::super::args_trait::quote_args_trait(config, impl_generics, inputs); + let args = syn::Ident::new("KernelArgs", proc_macro::Span::def_site().into()); + let args_trait = super::args_trait::quote_args_trait(&args, impl_generics, inputs); let cpu_func_lifetime_erased_types = - generate_lifetime_erased_types(crate_path, config, generics, inputs, macro_type_ids); + generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); let matching_kernel_assert = if skip_kernel_compilation() { quote!() @@ -49,7 +47,7 @@ pub(super) fn quote_get_ptx( }> = #crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( PTX_CSTR.to_bytes(), - #crate_path::host::specialise_kernel_call!( + 
#crate_path::host::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -88,7 +86,7 @@ pub(super) fn quote_get_ptx( use __rust_cuda_ffi_safe_assert::#args; #crate_path::host::link_kernel!{ - #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token + #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels } @@ -117,7 +115,7 @@ pub(super) fn quote_get_ptx( fn generate_lifetime_erased_types( crate_path: &syn::Path, - KernelConfig { args, .. }: &KernelConfig, + args: &syn::Ident, DeclGenerics { generic_start_token, generic_close_token, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs index ae2be49d9..f68b9cf34 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs @@ -2,6 +2,7 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +mod args_trait; mod get_ptx; use get_ptx::quote_get_ptx; @@ -9,7 +10,7 @@ use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME pub(in super::super) fn quote_cpu_linker_macro( crate_path: &syn::Path, - config @ KernelConfig { + KernelConfig { visibility, linker, .. 
}: &KernelConfig, decl_generics @ DeclGenerics { @@ -75,7 +76,6 @@ pub(in super::super) fn quote_cpu_linker_macro( let get_ptx = quote_get_ptx( crate_path, func_ident, - config, decl_generics, impl_generics, func_inputs, @@ -98,7 +98,7 @@ pub(in super::super) fn quote_cpu_linker_macro( #get_ptx fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_call!( + #crate_path::host::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_non_lt_generic_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index b0fa4625f..a51fc565a 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,14 +1,9 @@ use proc_macro2::TokenStream; -use super::super::super::{ - DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, -}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; -#[allow(clippy::too_many_arguments)] -#[allow(clippy::too_many_lines)] // FIXME pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, @@ -21,46 +16,14 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs + let kernel_func_inputs = func_inputs.iter().collect::>(); + let kernel_func_input_tys = func_inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type: syn::Type = syn::parse_quote! 
{ - <() as #private :: #args #ty_generics>::#type_ident - }; - let syn_type = if let syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - .. - }) = &**ty - { - syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem: Box::new(syn_type), - }) - } else { - syn_type - }; - - let param = quote! { - #(#attrs)* #pat #colon_token #syn_type - }; - - (param, syn_type) - }, + .map(|arg| match arg { + syn::FnArg::Typed(syn::PatType { ty, .. }) => syn::Type::clone(ty), syn::FnArg::Receiver(_) => unreachable!(), }) - .unzip(); + .collect::>(); let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs index 652ff4bc6..cd539a16c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs @@ -1,12 +1,10 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, InputCudaType}; pub(super) fn generate_async_func_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. 
}: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, @@ -16,17 +14,16 @@ pub(super) fn generate_async_func_types( func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, colon_token, ty, }) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote! { - <() as #private :: #args #ty_generics>::#type_ident + let syn_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, }; let cuda_type = match cuda_mode { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs index 55771c3c8..74402c939 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs @@ -1,29 +1,26 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::super::{FunctionInputs, ImplGenerics, InputCudaType, KernelConfig}; +use super::super::super::super::{FunctionInputs, InputCudaType}; pub(in super::super) fn generate_launch_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, - ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, -) -> (Vec, Vec) { +) -> (Vec, Vec) { let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .for_each(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { ty, .. 
}) => { - let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> - <() as #private :: #args #ty_generics>::#type_ident + let syn_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, }; cpu_func_unboxed_types.push(syn_type.clone()); diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs index d4830d254..c8ef3dbc0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod async_func_types; mod launch_types; @@ -13,8 +13,7 @@ use type_wrap::generate_func_input_and_ptx_jit_wraps; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( crate_path: &syn::Path, - config: &KernelConfig, - impl_generics @ ImplGenerics { ty_generics, .. }: &ImplGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, .. 
@@ -31,12 +30,11 @@ pub(super) fn quote_kernel_func_async( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - let kernel_func_async_inputs = - generate_async_func_types(crate_path, config, impl_generics, func_inputs, &stream); + let kernel_func_async_inputs = generate_async_func_types(crate_path, func_inputs, &stream); let (func_input_wrap, func_cpu_ptx_jit_wrap) = generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); let (cpu_func_types_launch, cpu_func_unboxed_types) = - generate_launch_types(crate_path, config, impl_generics, func_inputs); + generate_launch_types(crate_path, func_inputs); quote! { #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index ef99f68fc..b863a478f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod kernel_func; mod kernel_func_async; @@ -8,10 +8,8 @@ mod kernel_func_async; use kernel_func::quote_kernel_func_inputs; use kernel_func_async::quote_kernel_func_async; -#[allow(clippy::too_many_arguments)] pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, - config: &KernelConfig, decl: &DeclGenerics, impl_generics: &ImplGenerics, func_inputs: &FunctionInputs, @@ -21,7 +19,6 @@ pub(in super::super) fn quote_cpu_wrapper( ) -> TokenStream { let kernel_func = quote_kernel_func_inputs( crate_path, - config, impl_generics, decl, func_inputs, @@ -31,7 +28,6 @@ pub(in super::super) fn quote_cpu_wrapper( ); let kernel_func_async = quote_kernel_func_async( crate_path, - config, impl_generics, 
decl, func_inputs, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 3e573d583..058299b41 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -4,28 +4,32 @@ use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, KernelConfig, + FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, }; #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - config @ KernelConfig { args, private, .. }: &KernelConfig, inputs @ FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, - FuncIdent { + func @ FuncIdent { func_ident, func_ident_hash, .. }: &FuncIdent, - impl_generics: &ImplGenerics, + impl_generics @ ImplGenerics { + impl_generics: generics, + .. + }: &ImplGenerics, func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = specialise_ptx_func_inputs(crate_path, config, inputs); - let ptx_func_unboxed_types = specialise_ptx_unboxed_types(crate_path, config, inputs); + let (ptx_func_inputs, ptx_func_types) = + specialise_ptx_func_inputs(crate_path, inputs, func, impl_generics); + let ptx_func_unboxed_types = + specialise_ptx_unboxed_types(crate_path, inputs, func, impl_generics); let func_layout_params = func_params .iter() @@ -55,9 +59,12 @@ pub(in super::super) fn quote_cuda_wrapper( } } else { quote! {} }; - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; let syn_type = quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #generics in #func_ident) }; match cuda_mode { @@ -100,21 +107,12 @@ pub(in super::super) fn quote_cuda_wrapper( syn::FnArg::Receiver(_) => unreachable!(), }); - let args_trait = super::args_trait::quote_args_trait(config, impl_generics, inputs); - quote! { - // TODO: args trait should not be publicly available like this - // but specialisation requires it right now - #args_trait - #[cfg(target_os = "cuda")] - #[#crate_path::device::specialise_kernel_entry(#args)] + #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { - #[allow(unused_imports)] - use __rust_cuda_ffi_safe_assert::#args; - unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } @@ -135,8 +133,6 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(unused_imports)] use super::*; - #args_trait - extern "C" { #( #[allow(dead_code)] static #func_params: #ptx_func_types; @@ -161,17 +157,17 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ptx_func_inputs( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, func_input_cuda_types, }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> (Vec, Vec) { func_inputs .iter() .zip(func_input_cuda_types.iter()) - .enumerate() - .map(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(arg, (cuda_mode, _ptx_jit))| match arg { syn::FnArg::Typed( fn_arg @ syn::PatType { attrs, @@ -180,9 +176,12 @@ fn specialise_ptx_func_inputs( ty, }, ) => { - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; let syn_type = quote::quote_spanned! 
{ ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) }; let cuda_type = match cuda_mode { @@ -240,18 +239,21 @@ fn specialise_ptx_func_inputs( fn specialise_ptx_unboxed_types( crate_path: &syn::Path, - KernelConfig { args, private, .. }: &KernelConfig, FunctionInputs { func_inputs, .. }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, + ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> Vec { func_inputs .iter() - .enumerate() - .map(|(i, arg)| match arg { + .map(|arg| match arg { syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); + let arg_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#private :: #args :: #type_ident) + #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) } }, syn::FnArg::Receiver(_) => unreachable!(), diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs index 4dd9b4096..c7a2fcabd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,3 @@ -pub mod args_trait; pub mod cpu_linker_macro; pub mod cpu_wrapper; pub mod cuda_generic_function; diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 3d42c9d8b..64f6f4f3f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -14,9 +14,8 @@ use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ - args_trait::quote_args_trait, cpu_linker_macro::quote_cpu_linker_macro, - cpu_wrapper::quote_cpu_wrapper, 
cuda_generic_function::quote_cuda_generic_function, - cuda_wrapper::quote_cuda_wrapper, + cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, + cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, }; use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; @@ -210,10 +209,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let args_trait = quote_args_trait(&config, &impl_generics, &func_inputs); let cpu_wrapper = quote_cpu_wrapper( &crate_path, - &config, &decl_generics, &impl_generics, &func_inputs, @@ -221,7 +218,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident, &config); + let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident); let cpu_linker_macro = quote_cpu_linker_macro( &crate_path, &config, @@ -234,7 +231,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ); let cuda_wrapper = quote_cuda_wrapper( &crate_path, - &config, &func_inputs, &func_ident, &impl_generics, @@ -248,16 +244,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, &func.block, ); - let private = &config.private; (quote! { - mod #private { - #[allow(unused_imports)] - use super::*; - - #args_trait - } - #cpu_wrapper #cpu_cuda_check @@ -342,9 +330,10 @@ fn ident_from_pat_iter<'p, I: Iterator>(iter: I) -> Option< fn quote_generic_check( crate_path: &syn::Path, FuncIdent { - func_ident_hash, .. + func_ident, + func_ident_hash, + .. }: &FuncIdent, - KernelConfig { args, .. }: &KernelConfig, ) -> proc_macro2::TokenStream { let crate_name = match proc_macro::tracked_env::var("CARGO_CRATE_NAME") { Ok(crate_name) => crate_name.to_uppercase(), @@ -357,7 +346,7 @@ fn quote_generic_check( quote::quote_spanned! 
{ func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( - #func_ident_hash #args #crate_name #crate_manifest_dir + #func_ident #func_ident_hash #crate_name #crate_manifest_dir ); } } diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 1a0550bc5..4651be684 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -54,15 +54,15 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn specialise_kernel_call(tokens: TokenStream) -> TokenStream { - kernel::specialise::call::specialise_kernel_call(tokens) +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) } #[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] -pub fn specialise_kernel_entry(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::entry::specialise_kernel_entry(attr, func) +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) } #[doc(hidden)] diff --git a/src/device/mod.rs b/src/device/mod.rs index ca9aab9fd..93811bb04 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -5,7 +5,7 @@ use core::{ #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{specialise_kernel_entry, specialise_kernel_type}; +pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, diff --git a/src/host.rs b/src/host.rs index 5e01e5b1e..8df7d2fbe 100644 --- a/src/host.rs +++ b/src/host.rs @@ -18,7 +18,7 @@ use rustacuda_core::{DeviceCopy, DevicePointer}; #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] -pub use rust_cuda_derive::{check_kernel, 
link_kernel, specialise_kernel_call}; +pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; use crate::{ common::{ From 6868d6bba6e49540fba833ee5d68c543a8cd77a3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 06:11:57 +0000 Subject: [PATCH 052/120] Some refactoring of the async kernel func type + wrap code --- .../generate/cpu_wrapper/kernel_func.rs | 3 +- .../generate/cpu_wrapper/kernel_func_async.rs | 231 ++++++++++++++++++ .../kernel_func_async/async_func_types.rs | 83 ------- .../kernel_func_async/launch_types.rs | 71 ------ .../cpu_wrapper/kernel_func_async/mod.rs | 87 ------- .../kernel_func_async/type_wrap.rs | 53 ---- rust-cuda-derive/src/kernel/wrapper/mod.rs | 3 +- 7 files changed, 233 insertions(+), 298 deletions(-) create mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index a51fc565a..5cc6f8077 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -16,7 +16,6 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let kernel_func_inputs = func_inputs.iter().collect::>(); let kernel_func_input_tys = func_inputs .iter() .map(|arg| match arg { @@ -61,7 +60,7 @@ pub(super) fn quote_kernel_func_inputs( 
#[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, - #(#kernel_func_inputs),* + #func_inputs ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs new file mode 100644 index 000000000..8a0013900 --- /dev/null +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs @@ -0,0 +1,231 @@ +use proc_macro2::TokenStream; +use syn::spanned::Spanned; + +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; + +#[allow(clippy::too_many_arguments)] +pub(super) fn quote_kernel_func_async( + crate_path: &syn::Path, + ImplGenerics { ty_generics, .. }: &ImplGenerics, + DeclGenerics { + generic_kernel_params, + .. + }: &DeclGenerics, + func_inputs: &FunctionInputs, + FuncIdent { + func_ident, + func_ident_async, + .. + }: &FuncIdent, + func_params: &[syn::Ident], + func_attrs: &[syn::Attribute], +) -> TokenStream { + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); + let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); + + let ( + async_params, + launch_param_types, + unboxed_param_types, + launch_param_wrap, + ptx_jit_param_wrap, + ) = generate_type_wrap(crate_path, func_inputs, &stream); + + quote! 
{ + #[cfg(not(target_os = "cuda"))] + #(#func_attrs)* + #[allow(clippy::extra_unused_type_parameters)] + #[allow(clippy::too_many_arguments)] + #[allow(clippy::used_underscore_binding)] + #[allow(unused_variables)] + pub fn #func_ident_async <#stream, #generic_kernel_params>( + #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, + #(#async_params),* + ) -> #crate_path::rustacuda::error::CudaResult<()> { + let kernel_jit_result = if #launcher.config.ptx_jit { + #launcher.kernel.compile_with_ptx_jit_args(#ptx_jit_param_wrap)? + } else { + #launcher.kernel.compile_with_ptx_jit_args(None)? + }; + let function = match kernel_jit_result { + #crate_path::host::KernelJITResult::Recompiled(function) + | #crate_path::host::KernelJITResult::Cached(function) => function, + }; + + #[allow(clippy::redundant_closure_call)] + (|#(#func_params: #launch_param_types),*| { + if false { + #[allow(dead_code)] + fn assert_impl_devicecopy(_val: &T) {} + + #[allow(dead_code)] + fn assert_impl_no_safe_aliasing() {} + + #(assert_impl_devicecopy(&#func_params);)* + #(assert_impl_no_safe_aliasing::<#unboxed_param_types>();)* + } + + let #crate_path::host::LaunchConfig { + grid, block, shared_memory_size, ptx_jit: _, + } = #launcher.config.clone(); + + unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, + &[ + #( + &#func_params as *const _ as *mut ::core::ffi::c_void + ),* + ] + ) } + })(#(#launch_param_wrap),*) + } + } +} + +#[allow(clippy::too_many_lines)] // FIXME +fn generate_type_wrap( + crate_path: &syn::Path, + FunctionInputs { + func_inputs, + func_input_cuda_types, + }: &FunctionInputs, + stream: &syn::Lifetime, +) -> ( + Vec, + Vec, + Vec, + Vec, + TokenStream, +) { + let mut any_ptx_jit = false; + + let mut async_params = Vec::with_capacity(func_inputs.len()); + let mut launch_param_types = Vec::with_capacity(func_inputs.len()); + let mut unboxed_param_types = Vec::with_capacity(func_inputs.len()); + let mut launch_param_wrap = 
Vec::with_capacity(func_inputs.len()); + let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); + + func_inputs + .iter() + .zip(func_input_cuda_types.iter()) + .for_each(|(arg, (cuda_mode, ptx_jit))| match arg { + syn::FnArg::Typed(syn::PatType { + attrs, + pat, + colon_token, + ty, + }) => { + ptx_jit_param_wrap.push(if ptx_jit.0 { + any_ptx_jit = true; + + quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } + } else { + quote! { None } + }); + + #[allow(clippy::if_same_then_else)] + launch_param_wrap.push(if let syn::Type::Reference(_) = &**ty { + quote! { unsafe { #pat.for_device_async() } } + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + quote! { unsafe { #pat.for_device_async() } } + } else { + quote! { #pat } + }); + + let unboxed_param_type = match &**ty { + syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, + other => other, + }; + unboxed_param_types.push(unboxed_param_type.clone()); + + let cuda_param_type = match cuda_mode { + InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> + #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#unboxed_param_type> + }, + InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceAccessible< + <#unboxed_param_type as #crate_path::common::RustToCuda>::CudaRepresentation + > + }, + }; + + let (async_param, launch_param_type) = if let syn::Type::Reference(syn::TypeReference { + mutability, + lifetime, + .. + }) = &**ty + { + let lifetime_or_default = lifetime.clone().unwrap_or(syn::parse_quote!('_)); + let comma: Option = + lifetime.as_ref().map(|_| syn::parse_quote!(,)); + + let (async_param_type, launch_param_type) = if mutability.is_some() { + if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { + abort!( + mutability.span(), + "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." + ); + } + + ( + quote::quote_spanned! 
{ ty.span()=> + #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime_or_default, #cuda_param_type> + }, + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_param_type> + }, + ) + } else { + ( + quote::quote_spanned! { ty.span()=> + #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime_or_default, #cuda_param_type> + }, + quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_param_type> + }, + ) + }; + + (quote! { + #(#attrs)* #mutability #pat #colon_token #async_param_type + }, launch_param_type) + } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { + let async_param_type = quote::quote_spanned! { ty.span()=> + #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_param_type> + }; + let launch_param_type = quote::quote_spanned! { ty.span()=> + #crate_path::common::DeviceMutRef<#cuda_param_type> + }; + + ( + quote! { + #(#attrs)* #pat #colon_token #async_param_type + }, + launch_param_type + ) + } else { + ( + quote! { #(#attrs)* #pat #colon_token #cuda_param_type }, + quote! 
{ #cuda_param_type }, + ) + }; + + async_params.push(async_param); + launch_param_types.push(launch_param_type); + }, + syn::FnArg::Receiver(_) => unreachable!(), + }); + + let ptx_jit_param_wrap = if any_ptx_jit { + quote!(Some(&[#(#ptx_jit_param_wrap),*])) + } else { + quote!(None) + }; + + ( + async_params, + launch_param_types, + unboxed_param_types, + launch_param_wrap, + ptx_jit_param_wrap, + ) +} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs deleted file mode 100644 index cd539a16c..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/async_func_types.rs +++ /dev/null @@ -1,83 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::super::{FunctionInputs, InputCudaType}; - -pub(super) fn generate_async_func_types( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - stream: &syn::Lifetime, -) -> Vec { - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let syn_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote! { - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote! { - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. 
- }) = &**ty - { - let lifetime = lifetime.clone().unwrap_or(syn::parse_quote!('_)); - - let wrapped_type = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - quote!( - #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime, #cuda_type> - ) - } else { - quote!( - #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime, #cuda_type> - ) - }; - - quote! { - #(#attrs)* #mutability #pat #colon_token #wrapped_type - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let wrapped_type = quote! { - #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_type> - }; - - quote! { - #(#attrs)* #pat #colon_token #wrapped_type - } - } else { - quote! { #(#attrs)* #pat #colon_token #cuda_type } - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs deleted file mode 100644 index 74402c939..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/launch_types.rs +++ /dev/null @@ -1,71 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::super::{FunctionInputs, InputCudaType}; - -pub(in super::super) fn generate_launch_types( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, Vec) { - let mut cpu_func_types_launch = Vec::with_capacity(func_inputs.len()); - let mut cpu_func_unboxed_types = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .for_each(|(arg, (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. 
}) => { - let syn_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - cpu_func_unboxed_types.push(syn_type.clone()); - - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - cpu_func_types_launch.push( - if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - let comma: Option = - lifetime.as_ref().map(|_| syn::parse_quote!(,)); - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#cuda_type> - } - } else { - quote! 
{ #cuda_type } - }, - ); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - (cpu_func_types_launch, cpu_func_unboxed_types) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs deleted file mode 100644 index c8ef3dbc0..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/mod.rs +++ /dev/null @@ -1,87 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -mod async_func_types; -mod launch_types; -mod type_wrap; - -use async_func_types::generate_async_func_types; -use launch_types::generate_launch_types; -use type_wrap::generate_func_input_and_ptx_jit_wraps; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_async( - crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - DeclGenerics { - generic_kernel_params, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - - let kernel_func_async_inputs = generate_async_func_types(crate_path, func_inputs, &stream); - let (func_input_wrap, func_cpu_ptx_jit_wrap) = - generate_func_input_and_ptx_jit_wraps(crate_path, func_inputs); - let (cpu_func_types_launch, cpu_func_unboxed_types) = - generate_launch_types(crate_path, func_inputs); - - quote! 
{ - #[cfg(not(target_os = "cuda"))] - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - pub fn #func_ident_async <#stream, #generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, - #(#kernel_func_async_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(#func_cpu_ptx_jit_wrap)? - } else { - #launcher.kernel.compile_with_ptx_jit_args(None)? - }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) - | #crate_path::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #cpu_func_types_launch),*| { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#cpu_func_unboxed_types>();)* - } - - let #crate_path::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = #launcher.config.clone(); - - unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::core::ffi::c_void - ),* - ] - ) } - })(#(#func_input_wrap),*) - } - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs deleted file mode 100644 index 54ba2945b..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async/type_wrap.rs +++ /dev/null @@ -1,53 +0,0 @@ -use proc_macro2::TokenStream; - -use crate::kernel::wrapper::InputCudaType; - -use super::super::super::super::FunctionInputs; - 
-pub(super) fn generate_func_input_and_ptx_jit_wraps( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, -) -> (Vec, TokenStream) { - let mut any_ptx_jit = false; - - let (func_input_wrap, func_cpu_ptx_jit_wrap): (Vec, Vec) = - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - #[allow(clippy::if_same_then_else)] - let func_input = if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } - }; - - let ptx_load = if ptx_jit.0 { - any_ptx_jit = true; - - quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } - } else { - quote! { None } - }; - - (func_input, ptx_load) - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .unzip(); - - if any_ptx_jit { - ( - func_input_wrap, - quote!(Some(&[#(#func_cpu_ptx_jit_wrap),*])), - ) - } else { - (func_input_wrap, quote!(None)) - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 64f6f4f3f..b79dfb1fd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -35,8 +35,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! as impl KERNEL for LAUNCHER)] expects LINKER, \ - KERNEL, ARGS, and LAUNCHER identifiers: {:?}", + "#[kernel(pub? use LINKER! 
for impl)] expects LINKER identifier: {:?}", err ) }, From 27635ee42c4f1111c622767b1e144dd3b018fd2b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 21 Dec 2023 17:59:45 +0000 Subject: [PATCH 053/120] Early sketch of extracting type wrapping from macro into types and traits --- .../kernel/wrapper/generate/cuda_wrapper.rs | 2 + src/common.rs | 489 ++++++++++++++++++ src/host.rs | 77 ++- src/lib.rs | 1 + 4 files changed, 527 insertions(+), 42 deletions(-) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 058299b41..8aa57ab87 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -72,6 +72,8 @@ pub(in super::super) fn quote_cuda_wrapper( syn::TypeReference { and_token, .. } ) = &**ty { // DeviceCopy mode only supports immutable references + // TODO: ptx_jit_load should be here, not there + // also ptx_jit_load should not be enabled for interior mutability quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } } else { quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } diff --git a/src/common.rs b/src/common.rs index cf44848a4..2e7102a73 100644 --- a/src/common.rs +++ b/src/common.rs @@ -239,6 +239,43 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { } } +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceOwnedRef { + pub(super) pointer: *mut T, + pub(super) marker: PhantomData, +} + +// TODO: when should the drop run??? +#[cfg(feature = "host")] +impl Drop for DeviceOwnedRef { + fn drop(&mut self) { + // Safety: pointer comes from [`DeviceBox::into_device`] + // i.e. 
this function completes the roundtrip + let device_box = unsafe { rustacuda::memory::DeviceBox::from_raw(self.pointer) }; + + core::mem::drop(crate::host::CudaDropWrapper::from(device_box)); + } +} + +unsafe impl DeviceCopy for DeviceOwnedRef {} + +#[cfg(any(not(feature = "host"), doc))] +#[doc(cfg(not(feature = "host")))] +impl AsRef for DeviceOwnedRef { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(any(not(feature = "host"), doc))] +#[doc(cfg(not(feature = "host")))] +impl AsMut for DeviceOwnedRef { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} + pub(crate) mod crate_private { pub mod alloc { pub trait Sealed {} @@ -282,3 +319,455 @@ impl CombinedCudaAlloc { (self.0, self.1) } } + +mod sealed { + pub trait Sealed {} +} + +// TODO: doc cfg +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy; + type DeviceType; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ); +} + +#[repr(transparent)] +pub struct PerThreadShallowCopy< + T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout, +>(T); + +#[cfg(not(feature = "host"))] +impl + PerThreadShallowCopy +{ + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::Deref + for PerThreadShallowCopy +{ + type Target = T; + + fn 
deref(&self) -> &Self::Target { + &self.0 + } +} + +impl CudaKernelParameter + for PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type DeviceType = PerThreadShallowCopy; + type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( + param, + )) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + param + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = PerThreadShallowCopy(param.into_inner()); + + inner(param) + } +} +impl sealed::Sealed + for PerThreadShallowCopy +{ +} + +impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> + CudaKernelParameter for &'a PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + type DeviceType = &'a PerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + 
crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + inner(const_ref.as_async()) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = param.as_ref().into_ref(); + // Safety: PerThreadShallowCopy is a transparent newtype wrapper around T + let param = unsafe { &*(param as *const T).cast::>() }; + + inner(param) + } +} +impl<'a, T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> sealed::Sealed + for &'a PerThreadShallowCopy +{ +} + +#[repr(transparent)] +pub struct ShallowInteriorMutable(T); + +#[cfg(not(feature = "host"))] +impl core::ops::Deref for ShallowInteriorMutable { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter + for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + type DeviceType = &'a ShallowInteriorMutable; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + let result = inner(const_ref.as_async()); + + host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( + param, + ))?; + + result + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = param.as_ref().into_ref(); + // Safety: ShallowInteriorMutable is a transparent newtype wrapper around T + let param = unsafe { &*(param as *const T).cast::>() }; + + inner(param) + } +} +impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} + +pub trait InteriorMutableSafeDeviceCopy: + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout + sealed::Sealed +{ +} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! { $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! 
{ + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +// TODO: update const type layout +// impl +// InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell {} +// impl sealed::Sealed for +// core::cell::SyncUnsafeCell {} + +#[repr(transparent)] +pub struct SharedHeapPerThreadShallowCopy(core::mem::ManuallyDrop); + +#[cfg(not(feature = "host"))] +impl SharedHeapPerThreadShallowCopy { + #[must_use] + fn new(value: T) -> Self { + Self(core::mem::ManuallyDrop::new(value)) + } +} + +#[cfg(not(feature = "host"))] +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > SharedHeapPerThreadShallowCopy +{ + #[must_use] + pub fn into_inner(self) -> T { + core::mem::ManuallyDrop::into_inner(self.0) + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::Deref for SharedHeapPerThreadShallowCopy { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(not(feature = "host"))] +impl core::ops::DerefMut for SharedHeapPerThreadShallowCopy { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< + 'stream, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = SharedHeapPerThreadShallowCopy; + // TODO: where does the drop happen? 
+ type FfiType<'stream, 'b> = + DeviceOwnedRef::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + let param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + + inner(param) + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for SharedHeapPerThreadShallowCopy +{ +} + +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = &'a SharedHeapPerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 
'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + // param must never be dropped as we do NOT own any of the + // heap memory it might reference + let param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + + inner(¶m) + } +} +impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} + +impl<'a, T: 'static + RustToCuda> CudaKernelParameter + for &'a mut SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceMutRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + type DeviceType = &'a mut SharedHeapPerThreadShallowCopy; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + mut param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + mut param: Self::FfiType<'static, 'static>, + inner: impl FnOnce(Self::DeviceType), + ) { + // param must never be dropped as we do NOT own any of the + // heap memory it might reference + let mut param = + SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); + + inner(&mut param) + } +} +impl<'a, T: RustToCuda> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy {} diff --git a/src/host.rs b/src/host.rs 
index 8df7d2fbe..8fa437cf3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -22,7 +22,8 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, + NoCudaAlloc, RustToCuda, }, ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, @@ -779,56 +780,46 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, +pub struct HostAndDeviceOwned { + device_box: HostDeviceBox, + host_val: T, } -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { +impl HostAndDeviceOwned { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. 
- pub fn with_new< - O, - E: From, - F: for<'b> FnOnce(HostAndDeviceOwned<'b, T>) -> Result, - >( - mut value: T, + pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( + value: T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); // Safety: `device_box` contains exactly the device copy of `value` - let result = inner(HostAndDeviceOwned { - device_box: &mut device_box, - host_val: &mut value, - }); - - core::mem::drop(device_box); - core::mem::drop(value); - - result + inner(HostAndDeviceOwned { + device_box, + host_val: value, + }) } #[must_use] - pub fn for_device(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, + pub fn for_device(self) -> DeviceOwnedRef { + let mut device_box = ManuallyDrop::new(self.device_box); + + DeviceOwnedRef { + pointer: device_box.0.as_raw_mut(), + marker: PhantomData::, } } #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val + pub fn for_host(&self) -> &T { + &self.host_val } #[must_use] - pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceOwnedAsync<'stream, 'b, T> - where - 'a: 'b, - { + pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, T> { HostAndDeviceOwnedAsync { device_box: self.device_box, host_val: self.host_val, @@ -970,28 +961,30 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, - host_val: &'a mut T, +pub struct HostAndDeviceOwnedAsync<'stream, T: SafeDeviceCopy + DeviceCopy> { + device_box: HostDeviceBox, + host_val: T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { +impl<'stream, T: SafeDeviceCopy + DeviceCopy> 
HostAndDeviceOwnedAsync<'stream, T> { #[must_use] /// # Safety /// - /// The returned [`DeviceConstRef`] must only be used on the + /// The returned [`DeviceOwnedRef`] must only be used on the /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceMutRef<'a, T> { - DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), - reference: PhantomData, + pub unsafe fn for_device_async(self) -> DeviceOwnedRef { + let mut device_box = ManuallyDrop::new(self.device_box); + + DeviceOwnedRef { + pointer: device_box.0.as_raw_mut(), + marker: PhantomData, } } #[must_use] - pub fn for_host(&'a mut self) -> &'a T { - self.host_val + pub fn for_host(&self) -> &T { + &self.host_val } } diff --git a/src/lib.rs b/src/lib.rs index 15e704e79..f4bc7bbe0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ #![feature(panic_info_message)] #![feature(let_chains)] #![feature(inline_const)] +#![feature(sync_unsafe_cell)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From adde7a0359607f2aef5491ca79a2a1419752d580 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 22 Dec 2023 13:11:00 +0000 Subject: [PATCH 054/120] Early work towards using trait for kernel type wrap, ptx jit workaround missing --- examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 24 +- rust-cuda-derive/src/kernel/specialise/ty.rs | 48 +++- .../generate/cpu_linker_macro/get_ptx.rs | 58 +--- .../generate/cpu_wrapper/kernel_func.rs | 140 +++------- .../generate/cpu_wrapper/kernel_func_async.rs | 133 ++------- .../wrapper/generate/cuda_generic_function.rs | 30 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 194 +++---------- .../src/kernel/wrapper/inputs/attribute.rs | 30 +- .../src/kernel/wrapper/inputs/mod.rs | 107 +------ rust-cuda-derive/src/kernel/wrapper/mod.rs | 6 +- src/common.rs | 262 +++++++++--------- src/host.rs | 46 ++- src/lib.rs | 1 + 14 files changed, 366 
insertions(+), 715 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index dc38b3fa9..17cf42fd8 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -23,7 +23,7 @@ pub enum Action { #[rust_cuda::common::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(#[kernel(pass = SafeDeviceCopy)] action: Action) { +pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 796e6ee4f..10be57d65 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -11,6 +11,7 @@ #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] #![feature(decl_macro)] +#![recursion_limit = "1024"] extern crate alloc; @@ -55,20 +56,22 @@ pub struct Triple(i32, i32, i32); )] pub fn kernel< 'a, - T: rc::common::RustToCuda< + T: 'static + + rc::common::RustToCuda< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::common::EmptyCudaAlloc, - > + rc::safety::StackOnly + > + + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - #[kernel(pass = SafeDeviceCopy)] _x: &Dummy, - #[kernel(pass = LendRustToCuda, jit)] _y: &mut ShallowCopy>, - #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, - #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, - #[kernel(pass = LendRustToCuda)] _: Wrapper, - #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, - #[kernel(pass = SafeDeviceCopy)] q: Triple, - // #[kernel(pass = SafeDeviceCopy)] shared3: ThreadBlockShared, + _x: &rc::common::PerThreadShallowCopy, + #[kernel(jit)] _y: &mut rc::common::SharedHeapPerThreadShallowCopy>, + _z: &rc::common::SharedHeapPerThreadShallowCopy>, + #[kernel(jit)] _v @ _w: &'a rc::common::ShallowInteriorMutable, + _: 
rc::common::SharedHeapPerThreadShallowCopy>, + Tuple(s, mut __t): rc::common::PerThreadShallowCopy, + q: rc::common::PerThreadShallowCopy, + // shared3: ThreadBlockShared, ) { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); @@ -80,6 +83,7 @@ pub fn kernel< unsafe { (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } + // unsafe { core::arch::asm!("hi") } // unsafe { // *shared3.as_mut_ptr() = 12; diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-derive/src/kernel/specialise/ty.rs index 9805abc3c..1671f43f0 100644 --- a/rust-cuda-derive/src/kernel/specialise/ty.rs +++ b/rust-cuda-derive/src/kernel/specialise/ty.rs @@ -54,15 +54,23 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { ); } + // replace all lifetimes with 'static + ty = syn::fold::Fold::fold_type( + &mut FoldLifetimeAllStatic { + r#static: syn::parse_quote!('static), + }, + ty, + ); + for (generic, arg) in generics.params.into_iter().zip(args.into_iter()) { match (generic, arg) { ( syn::GenericParam::Lifetime(syn::LifetimeDef { - lifetime: generic, .. + lifetime: _generic, .. }), - syn::GenericArgument::Lifetime(arg), + syn::GenericArgument::Lifetime(_arg), ) => { - ty = syn::fold::Fold::fold_type(&mut FoldLifetimeGeneric { generic, arg }, ty); + // all lifetimes are already replaced with 'static above }, ( syn::GenericParam::Const(syn::ConstParam { ident: generic, .. 
}), @@ -115,18 +123,34 @@ impl syn::parse::Parse for SpecialiseTypeConfig { } } -struct FoldLifetimeGeneric { - generic: syn::Lifetime, - arg: syn::Lifetime, +struct FoldLifetimeAllStatic { + r#static: syn::Lifetime, } -impl syn::fold::Fold for FoldLifetimeGeneric { +impl syn::fold::Fold for FoldLifetimeAllStatic { + fn fold_type_reference(&mut self, r#ref: syn::TypeReference) -> syn::TypeReference { + let syn::TypeReference { + and_token, + lifetime: _, + mutability, + elem, + } = r#ref; + + syn::fold::fold_type_reference( + self, + syn::TypeReference { + and_token, + lifetime: Some(self.r#static.clone()), + mutability, + elem, + }, + ) + } + fn fold_lifetime(&mut self, lt: syn::Lifetime) -> syn::Lifetime { - if lt == self.generic { - self.arg.clone() - } else { - lt - } + let mut r#static = self.r#static.clone(); + r#static.set_span(lt.span()); + r#static } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index 7e4b88f87..e838d400c 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -3,7 +3,7 @@ use syn::spanned::Spanned; use crate::kernel::utils::skip_kernel_compilation; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -121,60 +121,32 @@ fn generate_lifetime_erased_types( generic_close_token, .. }: &DeclGenerics, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + FunctionInputs { func_inputs, .. 
}: &FunctionInputs, macro_type_ids: &[syn::Ident], -) -> Vec { - let mut cpu_func_lifetime_erased_types = Vec::with_capacity(func_inputs.len()); - +) -> Vec { func_inputs .iter() - .zip(func_input_cuda_types.iter()) .enumerate() - .for_each(|(i, (arg, (cuda_mode, _ptx_jit)))| match arg { + .map(|(i, arg)| match arg { syn::FnArg::Typed(syn::PatType { ty, .. }) => { let type_ident = quote::format_ident!("__T_{}", i); - let syn_type = quote::quote_spanned! { ty.span()=> + + let mut specialised_ty = quote::quote_spanned! { ty.span()=> <() as #args #generic_start_token #($#macro_type_ids),* #generic_close_token>::#type_ident }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } - cpu_func_lifetime_erased_types.push( - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<'static, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<'static, #cuda_type> - } - } else { - cuda_type - }, - ); + quote::quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + } }, syn::FnArg::Receiver(_) => unreachable!(), - }); - - cpu_func_lifetime_erased_types + }).collect() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 5cc6f8077..7eb7db1a4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -1,6 +1,7 @@ use proc_macro2::TokenStream; +use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; pub(super) fn quote_kernel_func_inputs( crate_path: &syn::Path, @@ -16,13 +17,32 @@ pub(super) fn quote_kernel_func_inputs( func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let kernel_func_input_tys = func_inputs + let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => syn::Type::clone(ty), + syn::FnArg::Typed(syn::PatType { + attrs, + ty, + pat, + colon_token, + }) => { + let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType + }; + + ( + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }), + ty, + ) + }, syn::FnArg::Receiver(_) => unreachable!(), }) - .collect::>(); + .unzip(); let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); @@ -60,26 +80,10 @@ pub(super) fn quote_kernel_func_inputs( #[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, - #func_inputs + #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; - // impls check adapted from Nikolai Vazquez's `impls` crate: - // https://docs.rs/impls/1.0.3/src/impls/lib.rs.html#584-602 - const fn __check_is_sync(_x: &T) -> bool { - trait IsSyncMarker { - const SYNC: bool = false; - } - impl IsSyncMarker for T {} - struct CheckIs(::core::marker::PhantomData); - #[allow(dead_code)] - impl CheckIs { - const SYNC: bool = true; - } - - >::SYNC - } - #raw_func_input_wrap } } @@ -88,89 +92,27 @@ pub(super) fn quote_kernel_func_inputs( #[allow(clippy::too_many_lines)] fn generate_raw_func_input_wrap( crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + FunctionInputs { func_inputs, .. }: &FunctionInputs, FuncIdent { func_ident_async, .. }: &FuncIdent, func_params: &[syn::Ident], launcher: &syn::Ident, ) -> TokenStream { - func_inputs - .iter() - .zip(func_params) - .zip(func_input_cuda_types.iter()) - .rev() - .fold( - quote! { - #func_ident_async(#launcher, #(#func_params),*)?; - #launcher.stream.synchronize() - }, - |inner, ((arg, param), (cuda_mode, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => match cuda_mode { - InputCudaType::SafeDeviceCopy => { - if let syn::Type::Reference(..) 
= &**ty { - let pat_box = quote::format_ident!("__{}_box", param); - - // DeviceCopy mode only supports immutable references - quote! { - let mut #pat_box = #crate_path::host::HostDeviceBox::from( - #crate_path::rustacuda::memory::DeviceBox::new( - #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - )? - ); - #[allow(clippy::redundant_closure_call)] - // Safety: `#pat_box` contains exactly the device copy of `#pat` - let __result = (|#pat| { #inner })(unsafe { - #crate_path::host::HostAndDeviceConstRef::new( - &#pat_box, #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from_ref(#pat) - ).as_async() - }); - - #[allow(invalid_reference_casting)] - if !__check_is_sync(#pat) { - // Safety: - // * Since `#ty` is `!Sync`, it contains interior mutability - // * Therefore, part of the 'immutable' device copy may have - // been mutated - // * If all mutation was confined to interior mutability, - // then passing these changes on is safe (and expected) - // * If any mutations occured outside interior mutability, - // then UB occurred, in the kernel (we're not the cause) - #pat_box.copy_to(unsafe { &mut *(#pat as *const _ as *mut _) })?; - } - - ::core::mem::drop(#pat_box); - __result - } - } else { - quote! { { - let #pat = #crate_path::utils::device_copy::SafeDeviceCopyWrapper::from(#pat); - #inner - } } - } - }, - InputCudaType::LendRustToCuda => { - if let syn::Type::Reference(syn::TypeReference { mutability, .. }) = &**ty { - if mutability.is_some() { - quote! { #crate_path::host::LendToCuda::lend_to_cuda_mut( - #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } else { - quote! { #crate_path::host::LendToCuda::lend_to_cuda( - #pat, |#pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } - } else { - quote! { #crate_path::host::LendToCuda::move_to_cuda( - #pat, |mut #pat| { (|#pat| { #inner })(#pat.as_async()) } - ) } - } - }, - }, - syn::FnArg::Receiver(_) => unreachable!(), + func_inputs.iter().rev().fold( + quote! 
{ + #func_ident_async(#launcher, #(#func_params),*)?; + #launcher.stream.synchronize() + }, + |inner, arg| match arg { + syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { + quote::quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::with_new_async( + #pat, #launcher.stream, |#pat| { #inner } + ) + } }, - ) + syn::FnArg::Receiver(_) => unreachable!(), + }, + ) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs index 8a0013900..39ce95e9d 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, InputCudaType}; +use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_kernel_func_async( @@ -23,13 +23,8 @@ pub(super) fn quote_kernel_func_async( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - let ( - async_params, - launch_param_types, - unboxed_param_types, - launch_param_wrap, - ptx_jit_param_wrap, - ) = generate_type_wrap(crate_path, func_inputs, &stream); + let (async_params, launch_param_types, launch_param_wrap, _ptx_jit_param_wrap) = + generate_type_wrap(crate_path, func_inputs, &stream); quote! { #[cfg(not(target_os = "cuda"))] @@ -43,7 +38,7 @@ pub(super) fn quote_kernel_func_async( #(#async_params),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(#ptx_jit_param_wrap)? + #launcher.kernel.compile_with_ptx_jit_args(None)? // TODO: #ptx_jit_param_wrap)? 
} else { #launcher.kernel.compile_with_ptx_jit_args(None)? }; @@ -54,17 +49,6 @@ pub(super) fn quote_kernel_func_async( #[allow(clippy::redundant_closure_call)] (|#(#func_params: #launch_param_types),*| { - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#unboxed_param_types>();)* - } - let #crate_path::host::LaunchConfig { grid, block, shared_memory_size, ptx_jit: _, } = #launcher.config.clone(); @@ -81,7 +65,6 @@ pub(super) fn quote_kernel_func_async( } } -#[allow(clippy::too_many_lines)] // FIXME fn generate_type_wrap( crate_path: &syn::Path, FunctionInputs { @@ -90,8 +73,7 @@ fn generate_type_wrap( }: &FunctionInputs, stream: &syn::Lifetime, ) -> ( - Vec, - Vec, + Vec, Vec, Vec, TokenStream, @@ -100,14 +82,13 @@ fn generate_type_wrap( let mut async_params = Vec::with_capacity(func_inputs.len()); let mut launch_param_types = Vec::with_capacity(func_inputs.len()); - let mut unboxed_param_types = Vec::with_capacity(func_inputs.len()); let mut launch_param_wrap = Vec::with_capacity(func_inputs.len()); let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); func_inputs .iter() .zip(func_input_cuda_types.iter()) - .for_each(|(arg, (cuda_mode, ptx_jit))| match arg { + .for_each(|(arg, ptx_jit)| match arg { syn::FnArg::Typed(syn::PatType { attrs, pat, @@ -122,95 +103,30 @@ fn generate_type_wrap( quote! { None } }); - #[allow(clippy::if_same_then_else)] - launch_param_wrap.push(if let syn::Type::Reference(_) = &**ty { - quote! { unsafe { #pat.for_device_async() } } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - quote! { unsafe { #pat.for_device_async() } } - } else { - quote! { #pat } + let async_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::AsyncHostType<#stream, '_> + }; + + let async_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(async_ty), + pat: pat.clone(), + colon_token: *colon_token, }); - let unboxed_param_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - unboxed_param_types.push(unboxed_param_type.clone()); - - let cuda_param_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#unboxed_param_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#unboxed_param_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, + async_params.push(async_param); + + let launch_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::FfiType<#stream, '_> }; - let (async_param, launch_param_type) = if let syn::Type::Reference(syn::TypeReference { - mutability, - lifetime, - .. - }) = &**ty - { - let lifetime_or_default = lifetime.clone().unwrap_or(syn::parse_quote!('_)); - let comma: Option = - lifetime.as_ref().map(|_| syn::parse_quote!(,)); - - let (async_param_type, launch_param_type) = if mutability.is_some() { - if matches!(cuda_mode, InputCudaType::SafeDeviceCopy) { - abort!( - mutability.span(), - "Cannot mutably alias a `SafeDeviceCopy` kernel parameter." - ); - } - - ( - quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceMutRefAsync<#stream, #lifetime_or_default, #cuda_param_type> - }, - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime #comma #cuda_param_type> - }, - ) - } else { - ( - quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceConstRefAsync<#stream, #lifetime_or_default, #cuda_param_type> - }, - quote::quote_spanned! 
{ ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime #comma #cuda_param_type> - }, - ) - }; - - (quote! { - #(#attrs)* #mutability #pat #colon_token #async_param_type - }, launch_param_type) - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let async_param_type = quote::quote_spanned! { ty.span()=> - #crate_path::host::HostAndDeviceOwnedAsync<#stream, '_, #cuda_param_type> - }; - let launch_param_type = quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#cuda_param_type> - }; - - ( - quote! { - #(#attrs)* #pat #colon_token #async_param_type - }, - launch_param_type - ) - } else { - ( - quote! { #(#attrs)* #pat #colon_token #cuda_param_type }, - quote! { #cuda_param_type }, - ) + launch_param_types.push(launch_ty); + + let launch_wrap = quote::quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::async_to_ffi(#pat) }; - async_params.push(async_param); - launch_param_types.push(launch_param_type); + launch_param_wrap.push(launch_wrap); }, syn::FnArg::Receiver(_) => unreachable!(), }); @@ -224,7 +140,6 @@ fn generate_type_wrap( ( async_params, launch_param_types, - unboxed_param_types, launch_param_wrap, ptx_jit_param_wrap, ) diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index aa23b77c6..a6d8ac550 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,8 +1,10 @@ use proc_macro2::TokenStream; +use syn::spanned::Spanned; use super::super::{DeclGenerics, FuncIdent}; pub(in super::super) fn quote_cuda_generic_function( + crate_path: &syn::Path, DeclGenerics { generic_start_token, generic_kernel_params: generic_params, @@ -14,10 +16,36 @@ pub(in super::super) fn quote_cuda_generic_function( func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { + let 
kernel_func_inputs = func_inputs + .iter() + .map(|arg| match arg { + syn::FnArg::Typed(syn::PatType { + attrs, + ty, + pat, + colon_token, + }) => { + let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> + <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> + }; + + syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ty), + pat: pat.clone(), + colon_token: *colon_token, + }) + }, + syn::FnArg::Receiver(_) => unreachable!(), + }) + .collect::>(); + quote! { #[cfg(target_os = "cuda")] #(#func_attrs)* - fn #func_ident #generic_start_token #generic_params #generic_close_token (#func_inputs) + fn #func_ident #generic_start_token #generic_params #generic_close_token ( + #(#kernel_func_inputs),* + ) #func_block } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 8aa57ab87..24365ee29 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,10 +1,9 @@ use proc_macro2::TokenStream; -use quote::quote_spanned; use syn::spanned::Spanned; use super::super::{ super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, InputCudaType, + FuncIdent, FunctionInputs, ImplGenerics, }; #[allow(clippy::too_many_lines)] @@ -26,10 +25,8 @@ pub(in super::super) fn quote_cuda_wrapper( func_attrs: &[syn::Attribute], func_params: &[syn::Ident], ) -> TokenStream { - let (ptx_func_inputs, ptx_func_types) = - specialise_ptx_func_inputs(crate_path, inputs, func, impl_generics); - let ptx_func_unboxed_types = - specialise_ptx_unboxed_types(crate_path, inputs, func, impl_generics); + let (ffi_inputs, ffi_types) = + specialise_ffi_input_types(crate_path, inputs, func, impl_generics); let func_layout_params = func_params .iter() @@ -41,69 +38,32 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); - let 
ptx_func_input_unwrap = func_inputs + let ffi_param_ptx_jit_wrap = func_inputs .iter().zip(func_input_cuda_types.iter()).enumerate() .rev() .fold(quote! { #func_ident(#(#func_params),*) - }, |inner, (i, (arg, (cuda_mode, ptx_jit)))| match arg { + }, |inner, (_i, (arg, _ptx_jit))| match arg { syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { // Emit PTX JIT load markers - let ptx_jit_load = if ptx_jit.0 { - quote! { - #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - } - } else { quote! {} }; - - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #generics in #func_ident) + // let ptx_jit_load = if ptx_jit.0 { + // quote! { + // #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) + // } + // } else { quote! {} }; + + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) }; - match cuda_mode { - InputCudaType::SafeDeviceCopy => if let syn::Type::Reference( - syn::TypeReference { and_token, .. } - ) = &**ty { - // DeviceCopy mode only supports immutable references - // TODO: ptx_jit_load should be here, not there - // also ptx_jit_load should not be enabled for interior mutability - quote! { { let #pat: #and_token #syn_type = #pat.as_ref().into_ref(); #inner } } - } else { - quote! { #ptx_jit_load; { let #pat: #syn_type = #pat.into_inner(); #inner } } - }, - InputCudaType::LendRustToCuda => if let syn::Type::Reference( - syn::TypeReference { and_token, mutability, ..} - ) = &**ty { - if mutability.is_some() { - quote! { - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_borrow_from_rust_mut( - #pat, |#pat: #and_token #mutability #crate_path::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } else { - quote! 
{ - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_borrow_from_rust( - #pat, |#pat: #and_token #crate_path::device::ShallowCopy<#syn_type>| { #inner }, - ) - } - } - } else { - quote! { - #ptx_jit_load; - #crate_path::device::BorrowFromRust::with_moved_from_rust( - #pat, |#pat: #syn_type| { #inner }, - ) - } - } + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device( + #pat, |#pat| { #inner } + ) } }, syn::FnArg::Receiver(_) => unreachable!(), @@ -114,15 +74,15 @@ pub(in super::super) fn quote_cuda_wrapper( #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] #(#func_attrs)* - pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ptx_func_inputs),*) { + pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } #( #[no_mangle] static #func_layout_params: [ - u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ptx_func_types>() - ] = #crate_path::const_type_layout::serialise_type_graph::<#ptx_func_types>(); + u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ffi_types>() + ] = #crate_path::const_type_layout::serialise_type_graph::<#ffi_types>(); unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* @@ -137,128 +97,50 @@ pub(in super::super) fn quote_cuda_wrapper( extern "C" { #( #[allow(dead_code)] - static #func_params: #ptx_func_types; + static #func_params: #ffi_types; )* } } - if false { - #[allow(dead_code)] - fn assert_impl_devicecopy(_val: &T) {} - - #[allow(dead_code)] - fn assert_impl_no_safe_aliasing() {} - - #(assert_impl_devicecopy(&#func_params);)* - #(assert_impl_no_safe_aliasing::<#ptx_func_unboxed_types>();)* - } - - #ptx_func_input_unwrap + #ffi_param_ptx_jit_wrap } } } -fn specialise_ptx_func_inputs( +fn specialise_ffi_input_types( crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, 
+ FunctionInputs { func_inputs, .. }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, ImplGenerics { impl_generics, .. }: &ImplGenerics, -) -> (Vec, Vec) { +) -> (Vec, Vec) { func_inputs .iter() - .zip(func_input_cuda_types.iter()) - .map(|(arg, (cuda_mode, _ptx_jit))| match arg { + .map(|arg| match arg { syn::FnArg::Typed( - fn_arg @ syn::PatType { + syn::PatType { attrs, pat, colon_token, ty, }, ) => { - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - let syn_type = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) }; - let cuda_type = match cuda_mode { - InputCudaType::SafeDeviceCopy => quote::quote_spanned! { ty.span()=> - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#syn_type> - }, - InputCudaType::LendRustToCuda => quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceAccessible< - <#syn_type as #crate_path::common::RustToCuda>::CudaRepresentation - > - }, - }; - - let ty = if let syn::Type::Reference(syn::TypeReference { - lifetime, - mutability, - .. - }) = &**ty - { - let lifetime = quote_spanned! { lifetime.span()=> - 'static - }; - - if mutability.is_some() { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceConstRef<#lifetime, #cuda_type> - } - } - } else if matches!(cuda_mode, InputCudaType::LendRustToCuda) { - let lifetime = quote_spanned! { ty.span()=> - 'static - }; - - quote::quote_spanned! { ty.span()=> - #crate_path::common::DeviceMutRef<#lifetime, #cuda_type> - } - } else { - cuda_type + let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> }; - let fn_arg = quote::quote_spanned! { fn_arg.span()=> - #(#attrs)* #pat #colon_token #ty - }; + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); - (fn_arg, ty) + (ffi_param, ffi_ty) }, syn::FnArg::Receiver(_) => unreachable!(), }) .unzip() } - -fn specialise_ptx_unboxed_types( - crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, - FuncIdent { func_ident, .. }: &FuncIdent, - ImplGenerics { impl_generics, .. }: &ImplGenerics, -) -> Vec { - func_inputs - .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let arg_type = match &**ty { - syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, - other => other, - }; - - quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#arg_type for #impl_generics in #func_ident) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }) - .collect() -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs index ceeee1e3e..4ca2ff7bf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs @@ -1,9 +1,6 @@ use syn::spanned::Spanned; -use super::InputCudaType; - pub(super) enum KernelInputAttribute { - PassType(proc_macro2::Span, InputCudaType), PtxJit(proc_macro2::Span, bool), } @@ -12,31 +9,6 @@ impl syn::parse::Parse for KernelInputAttribute { let ident: syn::Ident = input.parse()?; match &*ident.to_string() { - "pass" => { - let eq: syn::token::Eq = input.parse()?; - let mode: syn::Ident = input.parse()?; - - let cuda_type = match &*mode.to_string() { - "SafeDeviceCopy" => InputCudaType::SafeDeviceCopy, - "LendRustToCuda" => InputCudaType::LendRustToCuda, - _ => abort!( - 
mode.span(), - "Unexpected CUDA transfer mode `{:?}`: Expected `SafeDeviceCopy` or \ - `LendRustToCuda`.", - mode - ), - }; - - Ok(KernelInputAttribute::PassType( - ident - .span() - .join(eq.span()) - .unwrap() - .join(mode.span()) - .unwrap(), - cuda_type, - )) - }, "jit" => { let eq: Option = input.parse()?; @@ -61,7 +33,7 @@ impl syn::parse::Parse for KernelInputAttribute { }, _ => abort!( ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `pass` or `jit`.", + "Unexpected kernel attribute `{:?}`: Expected `jit`.", ident ), } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs index 9222de237..154503702 100644 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs @@ -1,19 +1,19 @@ use syn::spanned::Spanned; -use super::{InputCudaType, InputPtxJit}; +use super::InputPtxJit; mod attribute; use attribute::{KernelInputAttribute, KernelInputAttributes}; pub(super) struct FunctionInputs { pub(super) func_inputs: syn::punctuated::Punctuated, - pub(super) func_input_cuda_types: Vec<(InputCudaType, InputPtxJit)>, + pub(super) func_input_cuda_types: Vec, } pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { let (func_inputs, func_input_cuda_types): ( syn::punctuated::Punctuated, - Vec<(InputCudaType, InputPtxJit)>, + Vec, ) = func .sig .inputs @@ -22,15 +22,12 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { receiver @ syn::FnArg::Receiver(_) => { abort!(receiver.span(), "Kernel function must not have a receiver.") }, - syn::FnArg::Typed( - input @ syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let mut cuda_type: Option = None; + syn::FnArg::Typed(syn::PatType { + attrs, + pat, + colon_token, + ty, + }) => { let mut ptx_jit: Option = None; let attrs = attrs @@ -45,14 +42,6 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { for attr in attrs 
{ match attr { - KernelInputAttribute::PassType(_span, pass_type) - if cuda_type.is_none() => - { - cuda_type = Some(pass_type); - }, - KernelInputAttribute::PassType(span, _pass_type) => { - abort!(span, "Duplicate CUDA transfer mode declaration."); - }, KernelInputAttribute::PtxJit(span, jit) if ptx_jit.is_none() => { @@ -78,24 +67,14 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { .cloned() .collect(); - let cuda_type = cuda_type.unwrap_or_else(|| { - abort!( - input.span(), - "Kernel function input must specify its CUDA transfer mode using \ - #[kernel(pass = ...)]." - ); - }); - - let ty = ensure_reference_type_lifetime(ty, &cuda_type); - ( syn::FnArg::Typed(syn::PatType { attrs, pat: pat.clone(), colon_token: *colon_token, - ty, + ty: ty.clone(), }), - (cuda_type, ptx_jit.unwrap_or(InputPtxJit(false))), + ptx_jit.unwrap_or(InputPtxJit(false)), ) }, }) @@ -106,67 +85,3 @@ pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { func_input_cuda_types, } } - -#[allow(clippy::unnecessary_box_returns)] -fn ensure_reference_type_lifetime(ty: &syn::Type, cuda_type: &InputCudaType) -> Box { - match ty { - syn::Type::Reference(syn::TypeReference { - and_token, - lifetime, - mutability, - elem, - }) => { - let elem = if matches!(cuda_type, InputCudaType::LendRustToCuda) { - (|| { - if let syn::Type::Path(syn::TypePath { - path: syn::Path { segments, .. }, - qself: None, - }) = &**elem - { - if let Some(syn::PathSegment { - ident, - arguments: - syn::PathArguments::AngleBracketed( - syn::AngleBracketedGenericArguments { args, .. }, - ), - }) = segments.last() - { - if ident == "ShallowCopy" && segments.len() == 1 { - match args.last() { - Some(syn::GenericArgument::Type(elem)) if args.len() == 1 => { - return Box::new(elem.clone()); - }, - _ => { - abort!( - args.span(), - "`ShallowCopy` takes exactly one generic type \ - argument." 
- ); - }, - } - } - } - } - - emit_warning!( - elem.span(), - "RustToCuda kernel parameters should be explicitly wrapped with the \ - `ShallowCopy` marker to communicate their aliasing behaviour." - ); - - elem.clone() - })() - } else { - elem.clone() - }; - - Box::new(syn::Type::Reference(syn::TypeReference { - and_token: *and_token, - lifetime: lifetime.clone(), - mutability: *mutability, - elem, - })) - }, - ty => Box::new(ty.clone()), - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index b79dfb1fd..79bae8dbd 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -237,6 +237,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, ); let cuda_generic_function = quote_cuda_generic_function( + &crate_path, &decl_generics, &pat_func_inputs, &func_ident, @@ -257,11 +258,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .into() } -enum InputCudaType { - SafeDeviceCopy, - LendRustToCuda, -} - struct InputPtxJit(bool); #[allow(clippy::struct_field_names)] diff --git a/src/common.rs b/src/common.rs index 2e7102a73..c4a880262 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,11 +1,12 @@ #[cfg(any(not(feature = "host"), doc))] use core::convert::{AsMut, AsRef}; -use core::marker::PhantomData; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; #[cfg(feature = "host")] use alloc::fmt; -#[cfg(not(feature = "host"))] -use core::ops::{Deref, DerefMut}; #[cfg(feature = "host")] use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; @@ -241,28 +242,18 @@ impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { #[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceOwnedRef { +pub struct DeviceOwnedRef<'r, T: DeviceCopy> { + #[cfg_attr(feature = "host", allow(dead_code))] pub(super) pointer: *mut T, + pub(super) reference: PhantomData<&'r mut ()>, pub(super) marker: PhantomData, } -// TODO: 
when should the drop run??? -#[cfg(feature = "host")] -impl Drop for DeviceOwnedRef { - fn drop(&mut self) { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { rustacuda::memory::DeviceBox::from_raw(self.pointer) }; - - core::mem::drop(crate::host::CudaDropWrapper::from(device_box)); - } -} - -unsafe impl DeviceCopy for DeviceOwnedRef {} +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] -impl AsRef for DeviceOwnedRef { +impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } @@ -270,7 +261,7 @@ impl AsRef for DeviceOwnedRef { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] -impl AsMut for DeviceOwnedRef { +impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } } @@ -321,6 +312,7 @@ impl CombinedCudaAlloc { } mod sealed { + #[doc(hidden)] pub trait Sealed {} } @@ -330,8 +322,8 @@ pub trait CudaKernelParameter: sealed::Sealed { type SyncHostType; #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy; - type DeviceType; + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + type DeviceType<'b>; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME @@ -347,44 +339,55 @@ pub trait CudaKernelParameter: sealed::Sealed { ) -> Self::FfiType<'stream, 'b>; #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; } #[repr(transparent)] pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout, ->(T); - -#[cfg(not(feature = "host"))] -impl 
- PerThreadShallowCopy -{ - #[must_use] - pub fn into_inner(self) -> T { - self.0 - } + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, +> { + never: !, + _marker: PhantomData, } -#[cfg(not(feature = "host"))] -impl core::ops::Deref - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > Deref for PerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > DerefMut for PerThreadShallowCopy +{ + fn deref_mut(&mut self) -> &mut Self::Target { + self.never } } -impl CudaKernelParameter - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - type DeviceType = PerThreadShallowCopy; + type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] type SyncHostType = T; @@ -408,22 +411,30 @@ impl Cuda } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - let param = PerThreadShallowCopy(param.into_inner()); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.into_inner(); inner(param) } } -impl sealed::Sealed - for PerThreadShallowCopy +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy { } -impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> - 
CudaKernelParameter for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< @@ -431,7 +442,7 @@ impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGra 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - type DeviceType = &'a PerThreadShallowCopy; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; #[cfg(feature = "host")] @@ -466,31 +477,35 @@ impl<'a, T: 'static + crate::safety::SafeDeviceCopy + const_type_layout::TypeGra } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { let param = param.as_ref().into_ref(); - // Safety: PerThreadShallowCopy is a transparent newtype wrapper around T - let param = unsafe { &*(param as *const T).cast::>() }; inner(param) } } -impl<'a, T: crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout> sealed::Sealed - for &'a PerThreadShallowCopy +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy { } #[repr(transparent)] -pub struct ShallowInteriorMutable(T); +pub struct ShallowInteriorMutable { + never: !, + _marker: PhantomData, +} -#[cfg(not(feature = "host"))] -impl core::ops::Deref for ShallowInteriorMutable { +impl Deref for ShallowInteriorMutable { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never } } @@ -503,7 +518,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, 
crate::utils::device_copy::SafeDeviceCopyWrapper, >; - type DeviceType = &'a ShallowInteriorMutable; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; #[cfg(feature = "host")] @@ -546,13 +561,11 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { let param = param.as_ref().into_ref(); - // Safety: ShallowInteriorMutable is a transparent newtype wrapper around T - let param = unsafe { &*(param as *const T).cast::>() }; inner(param) } @@ -560,7 +573,10 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy + const_type_layout::TypeGraphLayout + sealed::Sealed + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout + + sealed::Sealed { } @@ -587,62 +603,41 @@ impl_atomic_interior_mutable! 
{ // core::cell::SyncUnsafeCell {} #[repr(transparent)] -pub struct SharedHeapPerThreadShallowCopy(core::mem::ManuallyDrop); - -#[cfg(not(feature = "host"))] -impl SharedHeapPerThreadShallowCopy { - #[must_use] - fn new(value: T) -> Self { - Self(core::mem::ManuallyDrop::new(value)) - } -} - -#[cfg(not(feature = "host"))] -impl< - T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, - > SharedHeapPerThreadShallowCopy -{ - #[must_use] - pub fn into_inner(self) -> T { - core::mem::ManuallyDrop::into_inner(self.0) - } +pub struct SharedHeapPerThreadShallowCopy { + never: !, + _marker: PhantomData, } -#[cfg(not(feature = "host"))] -impl core::ops::Deref for SharedHeapPerThreadShallowCopy { +impl Deref for SharedHeapPerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { - &self.0 + self.never } } -#[cfg(not(feature = "host"))] -impl core::ops::DerefMut for SharedHeapPerThreadShallowCopy { +impl DerefMut for SharedHeapPerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 + self.never } } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< 'stream, + 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = SharedHeapPerThreadShallowCopy; - // TODO: where does the drop happen? 
+ type DeviceType<'b> = T; type FfiType<'stream, 'b> = - DeviceOwnedRef::CudaRepresentation>>; + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] type SyncHostType = T; @@ -663,33 +658,35 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - let param = - SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // The type contains no allocations and is safe to copy + let param = unsafe { CudaAsRust::as_rust(param.as_ref()) }; inner(param) } } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - >, + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, > sealed::Sealed for SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = &'a SharedHeapPerThreadShallowCopy; + type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] @@ -712,21 +709,23 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThrea } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - // param must never be dropped as we do NOT own any of the - // heap memory it might reference - let param = - 
SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // Safety: param must never be dropped as we do NOT own any of the + // heap memory it might reference + let param = core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); inner(¶m) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a SharedHeapPerThreadShallowCopy +{ +} -impl<'a, T: 'static + RustToCuda> CudaKernelParameter +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter for &'a mut SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -735,7 +734,7 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter 'b, DeviceAccessible<::CudaRepresentation>, >; - type DeviceType = &'a mut SharedHeapPerThreadShallowCopy; + type DeviceType<'b> = &'b mut T; type FfiType<'stream, 'b> = DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; #[cfg(feature = "host")] @@ -758,16 +757,19 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, - inner: impl FnOnce(Self::DeviceType), - ) { - // param must never be dropped as we do NOT own any of the - // heap memory it might reference + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + // Safety: param must never be dropped as we do NOT own any of the + // heap memory it might reference let mut param = - SharedHeapPerThreadShallowCopy::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); + core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); inner(&mut param) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy {} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a mut 
SharedHeapPerThreadShallowCopy +{ +} diff --git a/src/host.rs b/src/host.rs index 8fa437cf3..f15bca27e 100644 --- a/src/host.rs +++ b/src/host.rs @@ -780,46 +780,45 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned { - device_box: HostDeviceBox, - host_val: T, +pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, } -impl HostAndDeviceOwned { +impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved /// to CUDA or an error occurs inside `inner`. pub fn with_new, F: FnOnce(HostAndDeviceOwned) -> Result>( - value: T, + mut value: T, inner: F, ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { - device_box, - host_val: value, + device_box: &mut device_box, + host_val: &mut value, }) } #[must_use] - pub fn for_device(self) -> DeviceOwnedRef { - let mut device_box = ManuallyDrop::new(self.device_box); - + pub fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: device_box.0.as_raw_mut(), + pointer: self.device_box.0.as_raw_mut(), marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, } } #[must_use] pub fn for_host(&self) -> &T { - &self.host_val + self.host_val } #[must_use] - pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, T> { + pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { device_box: self.device_box, host_val: self.host_val, @@ -961,30 +960,29 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, T: 
SafeDeviceCopy + DeviceCopy> { - device_box: HostDeviceBox, - host_val: T, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { + device_box: &'a mut HostDeviceBox, + host_val: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, T> { +impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 'a, T> { #[must_use] /// # Safety /// /// The returned [`DeviceOwnedRef`] must only be used on the /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceOwnedRef { - let mut device_box = ManuallyDrop::new(self.device_box); - + pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: device_box.0.as_raw_mut(), - marker: PhantomData, + pointer: self.device_box.0.as_raw_mut(), + marker: PhantomData::, + reference: PhantomData::<&'a mut ()>, } } #[must_use] pub fn for_host(&self) -> &T { - &self.host_val + self.host_val } } diff --git a/src/lib.rs b/src/lib.rs index f4bc7bbe0..61b807d8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,7 @@ #![feature(let_chains)] #![feature(inline_const)] #![feature(sync_unsafe_cell)] +#![feature(never_type)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From 446b1f7a8675dc04a616f2aff4379471b0ab9204 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 10:48:00 +0000 Subject: [PATCH 055/120] Lift complete CPU kernel wrapper from proc macro into public functions --- Cargo.toml | 9 +- examples/print/src/main.rs | 18 +- examples/single-source/src/main.rs | 4 +- .../generate/cpu_linker_macro/args_trait.rs | 9 +- .../generate/cpu_linker_macro/get_ptx.rs | 39 +- .../generate/cpu_wrapper/kernel_func.rs | 67 ++-- .../generate/cpu_wrapper/kernel_func_async.rs | 146 ------- .../wrapper/generate/cpu_wrapper/mod.rs | 14 - 
.../wrapper/generate/cuda_generic_function.rs | 19 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 91 ++--- .../src/kernel/wrapper/inputs/attribute.rs | 65 ---- .../src/kernel/wrapper/inputs/mod.rs | 87 ----- rust-cuda-derive/src/kernel/wrapper/mod.rs | 68 ++-- rust-cuda-derive/src/kernel/wrapper/parse.rs | 14 + rust-cuda-ptx-jit/Cargo.toml | 17 - rust-cuda-ptx-jit/src/device.rs | 13 - rust-cuda-ptx-jit/src/host/regex.rs | 46 --- rust-cuda-ptx-jit/src/lib.rs | 23 -- src/common.rs | 355 +++++++++++++++++- src/{host.rs => host/mod.rs} | 271 +++++++++---- .../src/host => src/host/ptx_jit}/mod.rs | 0 .../host => src/host/ptx_jit}/preprocess.rs | 10 +- src/host/ptx_jit/regex.rs | 58 +++ .../src/host => src/host/ptx_jit}/replace.rs | 9 +- src/lib.rs | 1 - 25 files changed, 783 insertions(+), 670 deletions(-) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs delete mode 100644 rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs delete mode 100644 rust-cuda-ptx-jit/Cargo.toml delete mode 100644 rust-cuda-ptx-jit/src/device.rs delete mode 100644 rust-cuda-ptx-jit/src/host/regex.rs delete mode 100644 rust-cuda-ptx-jit/src/lib.rs rename src/{host.rs => host/mod.rs} (77%) rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/mod.rs (100%) rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/preprocess.rs (93%) create mode 100644 src/host/ptx_jit/regex.rs rename {rust-cuda-ptx-jit/src/host => src/host/ptx_jit}/replace.rs (96%) diff --git a/Cargo.toml b/Cargo.toml index 9e9a568f2..0a1375547 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] members = [ - ".", "rust-cuda-derive", "rust-cuda-ptx-jit", + ".", "rust-cuda-derive", "examples/derive", "examples/print", "examples/single-source", ] default-members = [ - ".", "rust-cuda-derive", "rust-cuda-ptx-jit" + ".", "rust-cuda-derive", ] [package] @@ -19,7 +19,7 @@ rust-version = "1.75" # 
nightly [features] default = [] -host = ["rustacuda", "rust-cuda-ptx-jit/host"] +host = ["rustacuda", "regex"] derive = ["rustacuda_derive", "rust-cuda-derive"] [dependencies] @@ -28,9 +28,10 @@ rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } +regex = { version = "1.10", optional = true } + const-type-layout = { version = "0.2.0", features = ["derive"] } final = "0.1.1" rust-cuda-derive = { path = "rust-cuda-derive", optional = true } -rust-cuda-ptx-jit = { path = "rust-cuda-ptx-jit" } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 17cf42fd8..3d2f776e4 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -71,13 +71,25 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1(&stream, &config, Action::Print)?; + kernel.launch1::>( + &stream, + &config, + Action::Print, + )?; // kernel(&mut launcher, Action::Print)?; println!("Launching panic kernel ..."); - kernel.launch1(&stream, &config, Action::Panic)?; + kernel.launch1::>( + &stream, + &config, + Action::Panic, + )?; // kernel(&mut launcher, Action::Panic)?; println!("Launching alloc error kernel ..."); - kernel.launch1(&stream, &config, Action::AllocError)?; + kernel.launch1::>( + &stream, + &config, + Action::AllocError, + )?; // kernel(&mut launcher, Action::AllocError)?; Ok(()) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 10be57d65..f53963f9d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -65,9 +65,9 @@ pub fn kernel< + rc::safety::NoSafeAliasing, >( _x: 
&rc::common::PerThreadShallowCopy, - #[kernel(jit)] _y: &mut rc::common::SharedHeapPerThreadShallowCopy>, + _y: &mut rc::common::PtxJit>>, _z: &rc::common::SharedHeapPerThreadShallowCopy>, - #[kernel(jit)] _v @ _w: &'a rc::common::ShallowInteriorMutable, + _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, Tuple(s, mut __t): rc::common::PerThreadShallowCopy, q: rc::common::PerThreadShallowCopy, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs index 178ed026d..25cc27955 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs @@ -8,7 +8,7 @@ pub(in super::super) fn quote_args_trait( impl_generics, ty_generics, }: &ImplGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, ) -> TokenStream { let func_input_typedefs = (0..func_inputs.len()) .map(|i| { @@ -23,12 +23,7 @@ pub(in super::super) fn quote_args_trait( let func_input_types = func_inputs .iter() .enumerate() - .map(|(i, arg)| { - let pat_type = match arg { - syn::FnArg::Typed(pat_type) => pat_type, - syn::FnArg::Receiver(_) => unreachable!(), - }; - + .map(|(i, pat_type)| { let type_ident = quote::format_ident!("__T_{}", i); let arg_type = match &*pat_type.ty { syn::Type::Reference(syn::TypeReference { elem, .. }) => elem, diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs index e838d400c..439f27f9e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs @@ -121,32 +121,29 @@ fn generate_lifetime_erased_types( generic_close_token, .. 
}: &DeclGenerics, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, macro_type_ids: &[syn::Ident], ) -> Vec { func_inputs .iter() .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { ty, .. }) => { - let type_ident = quote::format_ident!("__T_{}", i); - - let mut specialised_ty = quote::quote_spanned! { ty.span()=> - <() as #args #generic_start_token - #($#macro_type_ids),* - #generic_close_token>::#type_ident - }; - // the args trait has to unbox outer lifetimes, so we need to add them back in here - if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { - let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; - - specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; - } + .map(|(i, syn::PatType { ty, .. })| { + let type_ident = quote::format_ident!("__T_{}", i); - quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> - } - }, - syn::FnArg::Receiver(_) => unreachable!(), + let mut specialised_ty = quote::quote_spanned! { ty.span()=> + <() as #args #generic_start_token + #($#macro_type_ids),* + #generic_close_token>::#type_ident + }; + // the args trait has to unbox outer lifetimes, so we need to add them back in here + if let syn::Type::Reference(syn::TypeReference { and_token, lifetime, mutability, .. }) = &**ty { + let lifetime = quote::quote_spanned! { lifetime.span()=> 'static }; + + specialised_ty = quote! { #and_token #lifetime #mutability #specialised_ty }; + } + + quote::quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + } }).collect() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs index 7eb7db1a4..b854ce160 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs @@ -12,20 +12,20 @@ pub(super) fn quote_kernel_func_inputs( generic_close_token, .. }: &DeclGenerics, - inputs @ FunctionInputs { func_inputs, .. }: &FunctionInputs, - fn_ident @ FuncIdent { func_ident, .. }: &FuncIdent, + FunctionInputs { func_inputs }: &FunctionInputs, + FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - ty, - pat, - colon_token, - }) => { + .map( + |syn::PatType { + attrs, + ty, + pat, + colon_token, + }| { let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType }; @@ -40,14 +40,17 @@ pub(super) fn quote_kernel_func_inputs( ty, ) }, - syn::FnArg::Receiver(_) => unreachable!(), - }) + ) .unzip(); + let cuda_kernel_param_tys = func_inputs + .iter() + .map(|syn::PatType { ty, .. 
}| &**ty) + .collect::>(); + let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let raw_func_input_wrap = - generate_raw_func_input_wrap(crate_path, inputs, fn_ident, func_params, &launcher); + let launch = quote::format_ident!("launch{}", func_inputs.len()); let full_generics = generic_kernel_params .iter() @@ -74,45 +77,19 @@ pub(super) fn quote_kernel_func_inputs( #[cfg(not(target_os = "cuda"))] #(#func_attrs)* - #[allow(clippy::needless_lifetimes)] #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] pub fn #func_ident <#generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#func_ident #ty_generics>, + #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #(#full_generics),* + #generic_close_token>, #(#kernel_func_inputs),* ) -> #crate_path::rustacuda::error::CudaResult<()> { let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; - #raw_func_input_wrap + #launcher.#launch::< + #(#cuda_kernel_param_tys),* + >(#(#func_params),*) } } } - -#[allow(clippy::too_many_lines)] -fn generate_raw_func_input_wrap( - crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, - FuncIdent { - func_ident_async, .. - }: &FuncIdent, - func_params: &[syn::Ident], - launcher: &syn::Ident, -) -> TokenStream { - func_inputs.iter().rev().fold( - quote! { - #func_ident_async(#launcher, #(#func_params),*)?; - #launcher.stream.synchronize() - }, - |inner, arg| match arg { - syn::FnArg::Typed(syn::PatType { pat, ty, .. }) => { - quote::quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::with_new_async( - #pat, #launcher.stream, |#pat| { #inner } - ) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), - }, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs deleted file mode 100644 index 39ce95e9d..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func_async.rs +++ /dev/null @@ -1,146 +0,0 @@ -use proc_macro2::TokenStream; -use syn::spanned::Spanned; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -#[allow(clippy::too_many_arguments)] -pub(super) fn quote_kernel_func_async( - crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, - DeclGenerics { - generic_kernel_params, - .. - }: &DeclGenerics, - func_inputs: &FunctionInputs, - FuncIdent { - func_ident, - func_ident_async, - .. - }: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let stream = syn::Lifetime::new("'stream", proc_macro2::Span::mixed_site()); - - let (async_params, launch_param_types, launch_param_wrap, _ptx_jit_param_wrap) = - generate_type_wrap(crate_path, func_inputs, &stream); - - quote! { - #[cfg(not(target_os = "cuda"))] - #(#func_attrs)* - #[allow(clippy::extra_unused_type_parameters)] - #[allow(clippy::too_many_arguments)] - #[allow(clippy::used_underscore_binding)] - #[allow(unused_variables)] - pub fn #func_ident_async <#stream, #generic_kernel_params>( - #launcher: &mut #crate_path::host::Launcher<#stream, '_, #func_ident #ty_generics>, - #(#async_params),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let kernel_jit_result = if #launcher.config.ptx_jit { - #launcher.kernel.compile_with_ptx_jit_args(None)? // TODO: #ptx_jit_param_wrap)? 
- } else { - #launcher.kernel.compile_with_ptx_jit_args(None)? - }; - let function = match kernel_jit_result { - #crate_path::host::KernelJITResult::Recompiled(function) - | #crate_path::host::KernelJITResult::Cached(function) => function, - }; - - #[allow(clippy::redundant_closure_call)] - (|#(#func_params: #launch_param_types),*| { - let #crate_path::host::LaunchConfig { - grid, block, shared_memory_size, ptx_jit: _, - } = #launcher.config.clone(); - - unsafe { #launcher.stream.launch(function, grid, block, shared_memory_size, - &[ - #( - &#func_params as *const _ as *mut ::core::ffi::c_void - ),* - ] - ) } - })(#(#launch_param_wrap),*) - } - } -} - -fn generate_type_wrap( - crate_path: &syn::Path, - FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, - stream: &syn::Lifetime, -) -> ( - Vec, - Vec, - Vec, - TokenStream, -) { - let mut any_ptx_jit = false; - - let mut async_params = Vec::with_capacity(func_inputs.len()); - let mut launch_param_types = Vec::with_capacity(func_inputs.len()); - let mut launch_param_wrap = Vec::with_capacity(func_inputs.len()); - let mut ptx_jit_param_wrap = Vec::with_capacity(func_inputs.len()); - - func_inputs - .iter() - .zip(func_input_cuda_types.iter()) - .for_each(|(arg, ptx_jit)| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - ptx_jit_param_wrap.push(if ptx_jit.0 { - any_ptx_jit = true; - - quote! { Some(#crate_path::ptx_jit::arg_as_raw_bytes(#pat.for_host())) } - } else { - quote! { None } - }); - - let async_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::AsyncHostType<#stream, '_> - }; - - let async_param = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(async_ty), - pat: pat.clone(), - colon_token: *colon_token, - }); - - async_params.push(async_param); - - let launch_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::FfiType<#stream, '_> - }; - - launch_param_types.push(launch_ty); - - let launch_wrap = quote::quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::async_to_ffi(#pat) - }; - - launch_param_wrap.push(launch_wrap); - }, - syn::FnArg::Receiver(_) => unreachable!(), - }); - - let ptx_jit_param_wrap = if any_ptx_jit { - quote!(Some(&[#(#ptx_jit_param_wrap),*])) - } else { - quote!(None) - }; - - ( - async_params, - launch_param_types, - launch_param_wrap, - ptx_jit_param_wrap, - ) -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs index b863a478f..eeb5cd5d4 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs @@ -3,10 +3,7 @@ use proc_macro2::TokenStream; use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; mod kernel_func; -mod kernel_func_async; - use kernel_func::quote_kernel_func_inputs; -use kernel_func_async::quote_kernel_func_async; pub(in super::super) fn quote_cpu_wrapper( crate_path: &syn::Path, @@ -26,19 +23,8 @@ pub(in super::super) fn quote_cpu_wrapper( func_params, func_attrs, ); - let kernel_func_async = quote_kernel_func_async( - crate_path, - impl_generics, - decl, - func_inputs, - fn_ident, - func_params, - func_attrs, - ); quote! { #kernel_func - - #kernel_func_async } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index a6d8ac550..8a5de226e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -11,20 +11,20 @@ pub(in super::super) fn quote_cuda_generic_function( generic_close_token, .. 
}: &DeclGenerics, - func_inputs: &syn::punctuated::Punctuated, + func_inputs: &syn::punctuated::Punctuated, FuncIdent { func_ident, .. }: &FuncIdent, func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { let kernel_func_inputs = func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed(syn::PatType { - attrs, - ty, - pat, - colon_token, - }) => { + .map( + |syn::PatType { + attrs, + ty, + pat, + colon_token, + }| { let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> }; @@ -36,8 +36,7 @@ pub(in super::super) fn quote_cuda_generic_function( colon_token: *colon_token, }) }, - syn::FnArg::Receiver(_) => unreachable!(), - }) + ) .collect::>(); quote! { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 24365ee29..7fce0a925 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -9,10 +9,7 @@ use super::super::{ #[allow(clippy::too_many_lines)] pub(in super::super) fn quote_cuda_wrapper( crate_path: &syn::Path, - inputs @ FunctionInputs { - func_inputs, - func_input_cuda_types, - }: &FunctionInputs, + inputs @ FunctionInputs { func_inputs }: &FunctionInputs, func @ FuncIdent { func_ident, func_ident_hash, @@ -39,34 +36,27 @@ pub(in super::super) fn quote_cuda_wrapper( .collect::>(); let ffi_param_ptx_jit_wrap = func_inputs - .iter().zip(func_input_cuda_types.iter()).enumerate() + .iter().enumerate() .rev() .fold(quote! { #func_ident(#(#func_params),*) - }, |inner, (_i, (arg, _ptx_jit))| match arg { - syn::FnArg::Typed(syn::PatType { - pat, - ty, - .. - }) => { - // Emit PTX JIT load markers - // let ptx_jit_load = if ptx_jit.0 { - // quote! { - // #crate_path::ptx_jit::PtxJITConstLoad!([#i] => #pat.as_ref()) - // } - // } else { quote! {} }; + }, |inner, (i, syn::PatType { + pat, + ty, + .. 
+ })| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) + }; - let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) - }; - - quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device( - #pat, |#pat| { #inner } - ) - } - }, - syn::FnArg::Receiver(_) => unreachable!(), + // Load the device param from its FFI representation + // To allow some parameters to also inject PTX JIT load markers here, + // we pass them the param index i + quote::quote_spanned! { ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device::<_, #i>( + #pat, |#pat| { #inner } + ) + } }); quote! { @@ -108,39 +98,34 @@ pub(in super::super) fn quote_cuda_wrapper( fn specialise_ffi_input_types( crate_path: &syn::Path, - FunctionInputs { func_inputs, .. }: &FunctionInputs, + FunctionInputs { func_inputs }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, ImplGenerics { impl_generics, .. }: &ImplGenerics, ) -> (Vec, Vec) { func_inputs .iter() - .map(|arg| match arg { - syn::FnArg::Typed( - syn::PatType { - attrs, - pat, - colon_token, - ty, - }, - ) => { - let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) - }; + .map(|syn::PatType { + attrs, + pat, + colon_token, + ty, + }| { + let specialised_ty = quote::quote_spanned! { ty.span()=> + #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) + }; - let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> - }; + let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> + <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + }; - let ffi_param = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(ffi_ty.clone()), - pat: pat.clone(), - colon_token: *colon_token, - }); + let ffi_param = syn::FnArg::Typed(syn::PatType { + attrs: attrs.clone(), + ty: Box::new(ffi_ty.clone()), + pat: pat.clone(), + colon_token: *colon_token, + }); - (ffi_param, ffi_ty) - }, - syn::FnArg::Receiver(_) => unreachable!(), + (ffi_param, ffi_ty) }) .unzip() } diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs deleted file mode 100644 index 4ca2ff7bf..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/attribute.rs +++ /dev/null @@ -1,65 +0,0 @@ -use syn::spanned::Spanned; - -pub(super) enum KernelInputAttribute { - PtxJit(proc_macro2::Span, bool), -} - -impl syn::parse::Parse for KernelInputAttribute { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let ident: syn::Ident = input.parse()?; - - match &*ident.to_string() { - "jit" => { - let eq: Option = input.parse()?; - - let (ptx_jit, span) = if eq.is_some() { - let value: syn::LitBool = input.parse()?; - - ( - value.value(), - ident - .span() - .join(eq.span()) - .unwrap() - .span() - .join(value.span()) - .unwrap(), - ) - } else { - (true, ident.span()) - }; - - Ok(KernelInputAttribute::PtxJit(span, ptx_jit)) - }, - _ => abort!( - ident.span(), - "Unexpected kernel attribute `{:?}`: Expected `jit`.", - ident - ), - } - } -} - -pub(super) struct KernelInputAttributes(Vec); - -impl syn::parse::Parse for KernelInputAttributes { - fn parse(input: syn::parse::ParseStream) -> syn::Result { - let content; - let _parens = syn::parenthesized!(content in input); - - syn::punctuated::Punctuated::< - KernelInputAttribute, syn::token::Comma - >::parse_separated_nonempty(&content).map(|punctuated| { - Self(punctuated.into_iter().collect()) - }) - } -} - 
-impl IntoIterator for KernelInputAttributes { - type IntoIter = std::vec::IntoIter; - type Item = KernelInputAttribute; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs b/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs deleted file mode 100644 index 154503702..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/inputs/mod.rs +++ /dev/null @@ -1,87 +0,0 @@ -use syn::spanned::Spanned; - -use super::InputPtxJit; - -mod attribute; -use attribute::{KernelInputAttribute, KernelInputAttributes}; - -pub(super) struct FunctionInputs { - pub(super) func_inputs: syn::punctuated::Punctuated, - pub(super) func_input_cuda_types: Vec, -} - -pub(super) fn parse_function_inputs(func: &syn::ItemFn) -> FunctionInputs { - let (func_inputs, func_input_cuda_types): ( - syn::punctuated::Punctuated, - Vec, - ) = func - .sig - .inputs - .iter() - .map(|arg| match arg { - receiver @ syn::FnArg::Receiver(_) => { - abort!(receiver.span(), "Kernel function must not have a receiver.") - }, - syn::FnArg::Typed(syn::PatType { - attrs, - pat, - colon_token, - ty, - }) => { - let mut ptx_jit: Option = None; - - let attrs = attrs - .iter() - .filter(|attr| match attr.path.get_ident() { - Some(ident) if ident == "kernel" => { - let attrs: KernelInputAttributes = - match syn::parse_macro_input::parse(attr.tokens.clone().into()) { - Ok(data) => data, - Err(err) => abort!(attr.span(), err), - }; - - for attr in attrs { - match attr { - KernelInputAttribute::PtxJit(span, jit) - if ptx_jit.is_none() => - { - if !matches!(&**ty, syn::Type::Reference(_)) && jit { - abort!( - span, - "Only reference types can be PTX JIT loaded." 
- ); - } - - ptx_jit = Some(InputPtxJit(jit)); - }, - KernelInputAttribute::PtxJit(span, _jit) => { - abort!(span, "Duplicate PTX JIT declaration."); - }, - } - } - - false - }, - _ => true, - }) - .cloned() - .collect(); - - ( - syn::FnArg::Typed(syn::PatType { - attrs, - pat: pat.clone(), - colon_token: *colon_token, - ty: ty.clone(), - }), - ptx_jit.unwrap_or(InputPtxJit(false)), - ) - }, - }) - .unzip(); - - FunctionInputs { - func_inputs, - func_input_cuda_types, - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 79bae8dbd..4486f4c49 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -7,7 +7,6 @@ use proc_macro::TokenStream; mod config; mod generate; -mod inputs; mod parse; use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; @@ -17,7 +16,6 @@ use generate::{ cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, }; -use inputs::{parse_function_inputs, FunctionInputs}; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; @@ -130,7 +128,19 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { } }; - let mut func_inputs = parse_function_inputs(&func); + let mut func_inputs = FunctionInputs { + func_inputs: func + .sig + .inputs + .into_iter() + .map(|arg| match arg { + syn::FnArg::Typed(arg) => arg, + syn::FnArg::Receiver(_) => { + unreachable!("already checked that no receiver arg exists") + }, + }) + .collect(), + }; let generic_kernel_params = func.sig.generics.params.clone(); let (generic_start_token, generic_close_token) = @@ -161,7 +171,6 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let func_ident = FuncIdent { func_ident: &func.sig.ident, - func_ident_async: quote::format_ident!("{}_async", &func.sig.ident), func_ident_hash: 
quote::format_ident!("{}_{:016x}", &func.sig.ident, kernel_hash), }; @@ -169,12 +178,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter() .enumerate() - .map(|(i, arg)| match arg { - syn::FnArg::Typed(syn::PatType { pat, .. }) => match ident_from_pat(pat) { - Some(ident) => ident, - None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), - }, - syn::FnArg::Receiver(_) => unreachable!(), + .map(|(i, syn::PatType { pat, .. })| match ident_from_pat(pat) { + Some(ident) => ident, + None => syn::Ident::new(&format!("{}_arg_{i}", func_ident.func_ident), pat.span()), }) .collect::>(); @@ -182,29 +188,28 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .func_inputs .iter_mut() .zip(&func_params) - .map(|(arg, ident)| match arg { - syn::FnArg::Typed(syn::PatType { + .map(|(arg, ident)| { + let syn::PatType { attrs, colon_token, ty, .. - }) => { - let ident_fn_arg = syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - pat: Box::new(syn::Pat::Ident(syn::PatIdent { - attrs: Vec::new(), - by_ref: None, - mutability: None, - ident: ident.clone(), - subpat: None, - })), - colon_token: *colon_token, - ty: ty.clone(), - }); - - std::mem::replace(arg, ident_fn_arg) - }, - syn::FnArg::Receiver(_) => unreachable!(), + } = arg; + + let ident_fn_arg = syn::PatType { + attrs: attrs.clone(), + pat: Box::new(syn::Pat::Ident(syn::PatIdent { + attrs: Vec::new(), + by_ref: None, + mutability: None, + ident: ident.clone(), + subpat: None, + })), + colon_token: *colon_token, + ty: ty.clone(), + }; + + std::mem::replace(arg, ident_fn_arg) }) .collect(); @@ -258,7 +263,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { .into() } -struct InputPtxJit(bool); +struct FunctionInputs { + func_inputs: syn::punctuated::Punctuated, +} #[allow(clippy::struct_field_names)] struct DeclGenerics<'f> { @@ -276,7 +283,6 @@ struct ImplGenerics<'f> { #[allow(clippy::struct_field_names)] struct 
FuncIdent<'f> { func_ident: &'f syn::Ident, - func_ident_async: syn::Ident, func_ident_hash: syn::Ident, } diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 56aa60053..6d31697cf 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -41,6 +41,20 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { ); } + for param in &func.sig.inputs { + if let syn::FnArg::Receiver(receiver) = param { + abort!(receiver.span(), "Kernel function must not have a receiver."); + } + } + + if func.sig.inputs.len() > 12 { + abort!( + func.sig.inputs.span(), + "Kernel function has too many arguments, {} were found but at most 12 are supported.", + func.sig.inputs.len() + ); + } + match &func.sig.output { syn::ReturnType::Default => (), syn::ReturnType::Type(_, box syn::Type::Tuple(tuple)) if tuple.elems.is_empty() => (), diff --git a/rust-cuda-ptx-jit/Cargo.toml b/rust-cuda-ptx-jit/Cargo.toml deleted file mode 100644 index dc5fe4249..000000000 --- a/rust-cuda-ptx-jit/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "rust-cuda-ptx-jit" -version = "0.1.0" -authors = ["Juniper Tyree "] -license = "MIT OR Apache-2.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[features] -default = [] -host = ["regex", "rustacuda", "lazy_static"] - -[dependencies] -rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", optional = true } -regex = { version = "1.5", optional = true } -lazy_static = { version = "1.4", optional = true } diff --git a/rust-cuda-ptx-jit/src/device.rs b/rust-cuda-ptx-jit/src/device.rs deleted file mode 100644 index c647a65eb..000000000 --- a/rust-cuda-ptx-jit/src/device.rs +++ /dev/null @@ -1,13 +0,0 @@ -#[macro_export] -#[doc(hidden)] -#[doc(cfg(not(feature = "host")))] -macro_rules! 
PtxJITConstLoad { - ([$index:literal] => $reference:expr) => { - unsafe { - ::core::arch::asm!( - ::core::concat!("// //"), - in(reg32) *($reference as *const _ as *const u32), - ) - } - }; -} diff --git a/rust-cuda-ptx-jit/src/host/regex.rs b/rust-cuda-ptx-jit/src/host/regex.rs deleted file mode 100644 index 5cff3bdc9..000000000 --- a/rust-cuda-ptx-jit/src/host/regex.rs +++ /dev/null @@ -1,46 +0,0 @@ -#[allow(unused_imports)] -use regex::bytes::Regex; - -lazy_static::lazy_static! { - pub static ref CONST_MARKER_REGEX: Regex = { - Regex::new( - r"(?-u)// %r\d+)-(?P\d+)> //" - ).unwrap() - }; - - pub static ref CONST_BASE_REGISTER_REGEX: Regex = { - Regex::new( - r"(?-u)ld\.global\.u32\s*(?P%r\d+)\s*,\s*\[(?P%r[ds]?\d+)]\s*;", - ).unwrap() - }; - - pub static ref CONST_LOAD_INSTRUCTION_REGEX: Regex = { - Regex::new( - r"(?x-u)(?P - ld\.global - (?:\.(?Pv[24]))? - \. - (?P[suf]) - (?P8|16|32|64) - \s* - (?P - (?:%[rf][sd]?\d+) | - (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) - ) - ,\s* - \[ - (?P%r[ds]?\d+) - (?: - \+ - (?P\d+) - )? 
- \] - \s*; - )", - ).unwrap() - }; - - pub static ref REGISTER_REGEX: Regex = { - Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap() - }; -} diff --git a/rust-cuda-ptx-jit/src/lib.rs b/rust-cuda-ptx-jit/src/lib.rs deleted file mode 100644 index 8b25fc9a0..000000000 --- a/rust-cuda-ptx-jit/src/lib.rs +++ /dev/null @@ -1,23 +0,0 @@ -#![deny(clippy::pedantic)] -#![cfg_attr(not(feature = "host"), no_std)] -#![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] -#![feature(doc_cfg)] -#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] - -#[cfg(feature = "host")] -mod host; - -#[cfg(feature = "host")] -pub use host::{PtxJITCompiler, PtxJITResult}; - -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] -mod device; - -pub fn arg_as_raw_bytes(r: &T) -> *const [u8] { - core::ptr::slice_from_raw_parts( - core::ptr::from_ref(r).cast::(), - core::mem::size_of_val(r), - ) -} diff --git a/src/common.rs b/src/common.rs index c4a880262..5360ccbbc 100644 --- a/src/common.rs +++ b/src/common.rs @@ -8,7 +8,10 @@ use core::{ #[cfg(feature = "host")] use alloc::fmt; #[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; +use core::{ + mem::MaybeUninit, + ptr::{copy_nonoverlapping, NonNull}, +}; use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; @@ -333,19 +336,43 @@ pub trait CudaKernelParameter: sealed::Sealed { inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result; + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b>; #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O; } 
-#[repr(transparent)] +pub struct PtxJit { + never: !, + _marker: PhantomData, +} + +impl Deref for PtxJit { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl DerefMut for PtxJit { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + pub struct PerThreadShallowCopy< T: crate::safety::SafeDeviceCopy + crate::safety::NoSafeAliasing @@ -403,6 +430,14 @@ impl< )) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -411,7 +446,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -469,6 +504,14 @@ impl< inner(const_ref.as_async()) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -477,7 +520,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -495,7 +538,68 @@ impl< { } -#[repr(transparent)] +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a PerThreadShallowCopy as 
CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> +{ +} + pub struct ShallowInteriorMutable { never: !, _marker: PhantomData, @@ -553,6 +657,14 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter result } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -561,7 +673,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(not(feature = 
"host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -602,7 +714,6 @@ impl_atomic_interior_mutable! { // impl sealed::Sealed for // core::cell::SyncUnsafeCell {} -#[repr(transparent)] pub struct SharedHeapPerThreadShallowCopy { never: !, _marker: PhantomData, @@ -650,6 +761,14 @@ impl< crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -658,7 +777,7 @@ impl< } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -701,6 +820,14 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -709,7 +836,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -749,6 +876,14 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) } + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: 
&Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( mut param: Self::AsyncHostType<'stream, 'b>, @@ -757,7 +892,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(not(feature = "host"))] - fn with_ffi_as_device( + fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -773,3 +908,201 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed for &'a mut SharedHeapPerThreadShallowCopy { } + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = + as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + type DeviceType<'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + 
emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a mut PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType< + 'stream, + 'b, + >; + type DeviceType<'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(not(feature = "host"))] + fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a mut SharedHeapPerThreadShallowCopy as 
CudaKernelParameter>::with_ffi_as_device::< + O, + PARAM, + >(param, inner) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a mut PtxJit> +{ +} + +#[cfg(feature = "host")] +fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { + NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) +} + +#[cfg(not(feature = "host"))] +fn emit_param_ptx_jit_marker(param: &T) { + unsafe { + core::arch::asm!( + "// //", + in(reg32) *(core::ptr::from_ref(param).cast::()), + const(INDEX), + ); + } +} diff --git a/src/host.rs b/src/host/mod.rs similarity index 77% rename from src/host.rs rename to src/host/mod.rs index f15bca27e..6cb31a508 100644 --- a/src/host.rs +++ b/src/host/mod.rs @@ -1,3 +1,4 @@ +use core::ptr::NonNull; use std::{ ffi::{CStr, CString}, marker::PhantomData, @@ -22,35 +23,99 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, - NoCudaAlloc, RustToCuda, + CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, + EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - ptx_jit::{PtxJITCompiler, PtxJITResult}, safety::SafeDeviceCopy, }; +mod ptx_jit; +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + pub struct Launcher<'stream, 'kernel, Kernel> { pub stream: &'stream Stream, pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } +macro_rules! 
impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; +} + impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, - { - self.kernel.launch0(self.stream, &self.config) - } + impl_launcher_launch! { launch0() => launch0_async } - #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, arg1: A) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, - { - self.kernel.launch1(self.stream, &self.config, arg1) - } + impl_launcher_launch! { launch1(arg1: A) => launch1_async } + + impl_launcher_launch! { launch2(arg1: A, arg2: B) => launch2_async } + + impl_launcher_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + + impl_launcher_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + + impl_launcher_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => launch5_async } + + impl_launcher_launch! 
{ launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => launch8_async } + + impl_launcher_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => launch12_async } } #[derive(Clone, Debug, PartialEq)] @@ -131,10 +196,133 @@ pub struct TypedPtxKernel { marker: PhantomData, } +macro_rules! impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + stream: &Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + impl_typed_kernel_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + self.$launch_async::<$($T),*>(stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'stream, $($T: CudaKernelParameter),*>( + &mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + let kernel_jit_result = if config.ptx_jit { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + self.compile_with_ptx_jit_args(Some(&[$($arg),*])) + } }? + } else { + self.compile_with_ptx_jit_args(None)? + }; + let function = match kernel_jit_result { + KernelJITResult::Recompiled(function) + | KernelJITResult::Cached(function) => function, + }; + + unsafe { stream.launch( + function, + config.grid.clone(), + config.block.clone(), + config.shared_memory_size, + &[ + $(core::ptr::from_mut( + &mut $T::async_to_ffi($arg) + ).cast::()),* + ], + ) } + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; + (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func(&$arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + impl TypedPtxKernel { + impl_typed_kernel_launch! { launch0() => launch0_async } + + impl_typed_kernel_launch! 
{ launch1(arg1: A) => launch1_async } + + impl_typed_kernel_launch! { launch2(arg1: A, arg2: B) => launch2_async } + + impl_typed_kernel_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + + impl_typed_kernel_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + + impl_typed_kernel_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => launch5_async } + + impl_typed_kernel_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => launch6_async } + + impl_typed_kernel_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => launch7_async } + + impl_typed_kernel_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => launch8_async } + + impl_typed_kernel_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => launch9_async } + + impl_typed_kernel_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => launch10_async } + + impl_typed_kernel_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => launch11_async } + + impl_typed_kernel_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => launch12_async } + #[must_use] pub fn new>(configure: Option>) -> Self { - let compiler = crate::ptx_jit::PtxJITCompiler::new(T::get_ptx()); + let compiler = PtxJITCompiler::new(T::get_ptx()); let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); Self { @@ -151,9 +339,9 @@ impl TypedPtxKernel { /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to /// [`Self::new`] is not a valid PTX source or does not contain the /// entry point it declares. 
- pub fn compile_with_ptx_jit_args( + fn compile_with_ptx_jit_args( &mut self, - arguments: Option<&[Option<*const [u8]>]>, + arguments: Option<&[Option<&NonNull<[u8]>>]>, ) -> CudaResult { let ptx_jit = self.compiler.with_arguments(arguments); @@ -179,51 +367,8 @@ impl TypedPtxKernel { Ok(kernel_jit) } - - #[allow(clippy::missing_errors_doc)] - pub fn launch0(&mut self, stream: &Stream, config: &LaunchConfig) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher) -> CudaResult<()>, - { - (const { conjure::() })(&mut Launcher { - stream, - kernel: self, - config: config.clone(), - }) - } - - #[allow(clippy::missing_errors_doc)] - pub fn launch1(&mut self, stream: &Stream, config: &LaunchConfig, arg1: A) -> CudaResult<()> - where - Kernel: Copy + FnOnce(&mut Launcher, A) -> CudaResult<()>, - { - (const { conjure::() })( - &mut Launcher { - stream, - kernel: self, - config: config.clone(), - }, - arg1, - ) - } -} - -const fn conjure() -> T { - union Transmute { - empty: (), - magic: T, - } - - assert!(std::mem::size_of::() == 0); - assert!(std::mem::align_of::() == 1); - - unsafe { Transmute { empty: () }.magic } } -struct Assert; -trait True {} -impl True for Assert {} - pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the diff --git a/rust-cuda-ptx-jit/src/host/mod.rs b/src/host/ptx_jit/mod.rs similarity index 100% rename from rust-cuda-ptx-jit/src/host/mod.rs rename to src/host/ptx_jit/mod.rs diff --git a/rust-cuda-ptx-jit/src/host/preprocess.rs b/src/host/ptx_jit/preprocess.rs similarity index 93% rename from rust-cuda-ptx-jit/src/host/preprocess.rs rename to src/host/ptx_jit/preprocess.rs index 0ee17733f..c22cf63e9 100644 --- a/rust-cuda-ptx-jit/src/host/preprocess.rs +++ b/src/host/ptx_jit/preprocess.rs @@ -5,7 +5,7 @@ use std::{ use super::{ regex::{ - CONST_BASE_REGISTER_REGEX, CONST_LOAD_INSTRUCTION_REGEX, CONST_MARKER_REGEX, REGISTER_REGEX, + 
const_base_register_regex, const_load_instruction_regex, const_marker_regex, register_regex, }, PtxElement, PtxJITCompiler, PtxLoadWidth, }; @@ -19,7 +19,7 @@ impl PtxJITCompiler { let mut const_markers: HashMap<&[u8], usize> = HashMap::new(); // Find injected rust-cuda-const-markers which identify dummy register rxx - for const_marker in CONST_MARKER_REGEX.captures_iter(ptx) { + for const_marker in const_marker_regex().captures_iter(ptx) { if let Some(tmpreg) = const_marker.name("tmpreg").map(|s| s.as_bytes()) { if let Some(param) = const_marker .name("param") @@ -36,7 +36,7 @@ impl PtxJITCompiler { let mut const_base_registers: HashMap<&[u8], usize> = HashMap::new(); // Find base register ryy which was used in `ld.global.u32 rxx, [ryy];` - for const_base_register in CONST_BASE_REGISTER_REGEX.captures_iter(ptx) { + for const_base_register in const_base_register_regex().captures_iter(ptx) { if let Some(tmpreg) = const_base_register.name("tmpreg").map(|s| s.as_bytes()) { if let Some(param) = const_markers.get(tmpreg) { if let Some(basereg) = const_base_register.name("basereg").map(|s| s.as_bytes()) @@ -54,7 +54,7 @@ impl PtxJITCompiler { let mut ptx_slices: Vec = Vec::new(); // Iterate over all load from base register with offset instructions - for const_load_instruction in CONST_LOAD_INSTRUCTION_REGEX.captures_iter(ptx) { + for const_load_instruction in const_load_instruction_regex().captures_iter(ptx) { // Only consider instructions where the base register is ryy if let Some(basereg) = const_load_instruction.name("basereg").map(|s| s.as_bytes()) { if let Some(param) = const_base_registers.get(basereg) { @@ -100,7 +100,7 @@ impl PtxJITCompiler { parameter_index: *param, byte_offset: loadoffset, load_width: loadwidth, - registers: REGISTER_REGEX + registers: register_regex() .captures_iter(constreg) .filter_map(|m| { m.name("register").map(|s| { diff --git a/src/host/ptx_jit/regex.rs b/src/host/ptx_jit/regex.rs new file mode 100644 index 000000000..58406b01e --- 
/dev/null +++ b/src/host/ptx_jit/regex.rs @@ -0,0 +1,58 @@ +use std::sync::OnceLock; + +use regex::bytes::Regex; + +#[allow(clippy::module_name_repetitions)] +pub fn const_marker_regex() -> &'static Regex { + static CONST_MARKER_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_MARKER_REGEX.get_or_init(|| { + Regex::new(r"(?-u)// <rust-cuda-ptx-jit-const-load-(?P<tmpreg>%r\d+)-(?P<param>\d+)> //") + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn const_base_register_regex() -> &'static Regex { + static CONST_BASE_REGISTER_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_BASE_REGISTER_REGEX.get_or_init(|| { + Regex::new(r"(?-u)ld\.global\.u32\s*(?P<tmpreg>%r\d+)\s*,\s*\[(?P<basereg>%r[ds]?\d+)]\s*;") + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn const_load_instruction_regex() -> &'static Regex { + static CONST_LOAD_INSTRUCTION_REGEX: OnceLock<Regex> = OnceLock::new(); + CONST_LOAD_INSTRUCTION_REGEX.get_or_init(|| { + Regex::new( + r"(?x-u)(?P<instruction> + ld\.global + (?:\.(?P<vector>v[24]))? + \. + (?P<loadtype>[suf]) + (?P<loadwidth>8|16|32|64) + \s* + (?P<constreg> + (?:%[rf][sd]?\d+) | + (?:\{(?:\s*%[rf][sd]?\d+,)*\s*%[rf][sd]?\d+\s*\}) + ) + ,\s* + \[ + (?P<basereg>%r[ds]?\d+) + (?: + \+ + (?P<loadoffset>\d+) + )?
+ \] + \s*; + )", + ) + .unwrap() + }) +} + +#[allow(clippy::module_name_repetitions)] +pub fn register_regex() -> &'static Regex { + static REGISTER_REGEX: OnceLock = OnceLock::new(); + REGISTER_REGEX.get_or_init(|| Regex::new(r"(?-u)(?P%[rf][sd]?\d+)").unwrap()) +} diff --git a/rust-cuda-ptx-jit/src/host/replace.rs b/src/host/ptx_jit/replace.rs similarity index 96% rename from rust-cuda-ptx-jit/src/host/replace.rs rename to src/host/ptx_jit/replace.rs index 920842d6f..ed59701c7 100644 --- a/rust-cuda-ptx-jit/src/host/replace.rs +++ b/src/host/ptx_jit/replace.rs @@ -1,10 +1,11 @@ +use core::ptr::NonNull; use std::{ffi::CString, ops::Deref}; use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; impl PtxJITCompiler { #[allow(clippy::too_many_lines)] - pub fn with_arguments(&mut self, arguments: Option<&[Option<*const [u8]>]>) -> PtxJITResult { + pub fn with_arguments(&mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>) -> PtxJITResult { // Check if the arguments, cast as byte slices, are the same as the last cached // ones #[allow(clippy::explicit_deref_methods)] @@ -16,7 +17,7 @@ impl PtxJITCompiler { .zip(last_arguments.iter()) .all(|(a, b)| match (a, b) { (None, None) => false, - (Some(a), Some(b)) => (unsafe { &**a }) != b.deref(), + (Some(a), Some(b)) => (unsafe { a.as_ref() }) != b.deref(), _ => true, }) }, @@ -30,7 +31,9 @@ impl PtxJITCompiler { self.last_arguments = arguments.map(|arguments| { arguments .iter() - .map(|arg| arg.map(|bytes| unsafe { &*bytes }.to_owned().into_boxed_slice())) + .map(|arg| { + arg.map(|bytes| unsafe { bytes.as_ref() }.to_owned().into_boxed_slice()) + }) .collect::>>>() .into_boxed_slice() }); diff --git a/src/lib.rs b/src/lib.rs index 61b807d8b..0bf8b0e21 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,7 +40,6 @@ #[doc(hidden)] pub extern crate alloc; -pub extern crate rust_cuda_ptx_jit as ptx_jit; pub extern crate rustacuda_core; #[doc(hidden)] From adfff4328d6a1b62640d7f276bfaf8bcbe394d6c Mon Sep 17 00:00:00 
2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 11:28:02 +0000 Subject: [PATCH 056/120] Add async launch helper --- src/host/mod.rs | 153 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 29 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 6cb31a508..539d24207 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -39,7 +39,7 @@ pub struct Launcher<'stream, 'kernel, Kernel> { } macro_rules! impl_launcher_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<$($T: CudaKernelParameter),*>( @@ -55,6 +55,35 @@ macro_rules! impl_launcher_launch { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch_async<$($T: CudaKernelParameter),*>( @@ -70,52 +99,68 @@ macro_rules! 
impl_launcher_launch { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; } impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - impl_launcher_launch! { launch0() => launch0_async } + impl_launcher_launch! { launch0() => with0_async => launch0_async } - impl_launcher_launch! { launch1(arg1: A) => launch1_async } + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } - impl_launcher_launch! { launch2(arg1: A, arg2: B) => launch2_async } + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } - impl_launcher_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } - impl_launcher_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } impl_launcher_launch! { launch5( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => launch5_async } + ) => with5_async => launch5_async } impl_launcher_launch! { launch6( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => launch6_async } + ) => with6_async => launch6_async } impl_launcher_launch! { launch7( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => launch7_async } + ) => with7_async => launch7_async } impl_launcher_launch! { launch8( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => launch8_async } + ) => with8_async => launch8_async } impl_launcher_launch! 
{ launch9( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => launch9_async } + ) => with9_async => launch9_async } impl_launcher_launch! { launch10( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => launch10_async } + ) => with10_async => launch10_async } impl_launcher_launch! { launch11( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K - ) => launch11_async } + ) => with11_async => launch11_async } impl_launcher_launch! { launch12( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K, arg12: L - ) => launch12_async } + ) => with12_async => launch12_async } } #[derive(Clone, Debug, PartialEq)] @@ -197,7 +242,7 @@ pub struct TypedPtxKernel { } macro_rules! impl_typed_kernel_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $launch_async:ident) => { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { #[allow(clippy::missing_errors_doc)] #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<$($T: CudaKernelParameter),*>( @@ -206,6 +251,48 @@ macro_rules! impl_typed_kernel_launch { config: &LaunchConfig, $($arg: $T::SyncHostType),* ) -> CudaResult<()> + where + Kernel: Copy + FnOnce( + &mut Launcher, + $($T::SyncHostType),* + ) -> CudaResult<()>, + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + + // important: always synchronise here, this function is sync! 
+ match (stream.synchronize(), result) { + (Ok(()), result) => result, + (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), + } + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + &'stream Stream, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result where Kernel: Copy + FnOnce( &mut Launcher, @@ -213,7 +300,7 @@ macro_rules! impl_typed_kernel_launch { ) -> CudaResult<()>, { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - self.$launch_async::<$($T),*>(stream, config, $($arg),*) + inner(self, stream, config, $($arg),*) } } } @@ -276,49 +363,57 @@ macro_rules! impl_typed_kernel_launch { } impl TypedPtxKernel { - impl_typed_kernel_launch! { launch0() => launch0_async } + impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } - impl_typed_kernel_launch! { launch1(arg1: A) => launch1_async } + impl_typed_kernel_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } - impl_typed_kernel_launch! { launch2(arg1: A, arg2: B) => launch2_async } + impl_typed_kernel_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } - impl_typed_kernel_launch! { launch3(arg1: A, arg2: B, arg3: C) => launch3_async } + impl_typed_kernel_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } - impl_typed_kernel_launch! { launch4(arg1: A, arg2: B, arg3: C, arg4: D) => launch4_async } + impl_typed_kernel_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } impl_typed_kernel_launch! 
{ launch5( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => launch5_async } + ) => with5_async => launch5_async } impl_typed_kernel_launch! { launch6( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => launch6_async } + ) => with6_async => launch6_async } impl_typed_kernel_launch! { launch7( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => launch7_async } + ) => with7_async => launch7_async } impl_typed_kernel_launch! { launch8( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => launch8_async } + ) => with8_async => launch8_async } impl_typed_kernel_launch! { launch9( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => launch9_async } + ) => with9_async => launch9_async } impl_typed_kernel_launch! { launch10( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => launch10_async } + ) => with10_async => launch10_async } impl_typed_kernel_launch! { launch11( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K - ) => launch11_async } + ) => with11_async => launch11_async } impl_typed_kernel_launch! 
{ launch12( arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, arg11: K, arg12: L - ) => launch12_async } + ) => with12_async => launch12_async } #[must_use] pub fn new>(configure: Option>) -> Self { From 93e8d202e015a8618f41e2937ea241866c01383d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 13:03:31 +0000 Subject: [PATCH 057/120] Further cleanup of the new kernel param API --- examples/print/src/main.rs | 21 +------ .../wrapper/generate/cpu_wrapper/mod.rs | 30 ---------- .../kernel_func.rs => host_kernel_ty.rs} | 60 ++++++------------- .../args_trait.rs | 0 .../get_ptx.rs | 0 .../mod.rs | 2 +- .../src/kernel/wrapper/generate/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 14 ++--- rust-cuda-derive/src/kernel/wrapper/parse.rs | 2 +- src/common.rs | 39 ++++++++---- src/host/mod.rs | 36 +++++------ src/lib.rs | 2 + 12 files changed, 79 insertions(+), 131 deletions(-) delete mode 100644 rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_wrapper/kernel_func.rs => host_kernel_ty.rs} (53%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/args_trait.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/get_ptx.rs (100%) rename rust-cuda-derive/src/kernel/wrapper/generate/{cpu_linker_macro => host_linker_macro}/mod.rs (98%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 3d2f776e4..62a0e2713 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -71,26 +71,11 @@ fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { // Launch the CUDA kernel on the stream and synchronise to its completion println!("Launching print kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::Print, - )?; - // kernel(&mut launcher, Action::Print)?; + kernel.launch1(&stream, &config, Action::Print)?; 
println!("Launching panic kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::Panic, - )?; - // kernel(&mut launcher, Action::Panic)?; + kernel.launch1(&stream, &config, Action::Panic)?; println!("Launching alloc error kernel ..."); - kernel.launch1::>( - &stream, - &config, - Action::AllocError, - )?; - // kernel(&mut launcher, Action::AllocError)?; + kernel.launch1(&stream, &config, Action::AllocError)?; Ok(()) } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs deleted file mode 100644 index eeb5cd5d4..000000000 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/mod.rs +++ /dev/null @@ -1,30 +0,0 @@ -use proc_macro2::TokenStream; - -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; - -mod kernel_func; -use kernel_func::quote_kernel_func_inputs; - -pub(in super::super) fn quote_cpu_wrapper( - crate_path: &syn::Path, - decl: &DeclGenerics, - impl_generics: &ImplGenerics, - func_inputs: &FunctionInputs, - fn_ident: &FuncIdent, - func_params: &[syn::Ident], - func_attrs: &[syn::Attribute], -) -> TokenStream { - let kernel_func = quote_kernel_func_inputs( - crate_path, - impl_generics, - decl, - func_inputs, - fn_ident, - func_params, - func_attrs, - ); - - quote! 
{ - #kernel_func - } -} diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs similarity index 53% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs index b854ce160..75c86820f 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_wrapper/kernel_func.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,48 +1,21 @@ use proc_macro2::TokenStream; -use syn::spanned::Spanned; -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; -pub(super) fn quote_kernel_func_inputs( +pub(in super::super) fn quote_host_kernel_ty( crate_path: &syn::Path, - ImplGenerics { ty_generics, .. }: &ImplGenerics, DeclGenerics { generic_kernel_params, generic_start_token, generic_close_token, .. }: &DeclGenerics, + ImplGenerics { ty_generics, .. }: &ImplGenerics, FunctionInputs { func_inputs }: &FunctionInputs, FuncIdent { func_ident, .. }: &FuncIdent, func_params: &[syn::Ident], func_attrs: &[syn::Attribute], ) -> TokenStream { - let (kernel_func_inputs, kernel_func_input_tys): (Vec<_>, Vec<_>) = func_inputs - .iter() - .map( - |syn::PatType { - attrs, - ty, - pat, - colon_token, - }| { - let ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::SyncHostType - }; - - ( - syn::FnArg::Typed(syn::PatType { - attrs: attrs.clone(), - ty: Box::new(ty.clone()), - pat: pat.clone(), - colon_token: *colon_token, - }), - ty, - ) - }, - ) - .unzip(); - let cuda_kernel_param_tys = func_inputs .iter() .map(|syn::PatType { ty, .. 
}| &**ty) @@ -50,8 +23,6 @@ pub(super) fn quote_kernel_func_inputs( let launcher = syn::Ident::new("launcher", proc_macro2::Span::mixed_site()); - let launch = quote::format_ident!("launch{}", func_inputs.len()); - let full_generics = generic_kernel_params .iter() .map(|param| match param { @@ -61,6 +32,9 @@ pub(super) fn quote_kernel_func_inputs( }) .collect::>(); + let mut private_func_ident = syn::Ident::clone(func_ident); + private_func_ident.set_span(proc_macro::Span::def_site().into()); + let ty_turbofish = ty_generics.as_turbofish(); quote! { @@ -68,28 +42,30 @@ pub(super) fn quote_kernel_func_inputs( #[allow(non_camel_case_types)] pub type #func_ident #generic_start_token #generic_kernel_params - #generic_close_token = impl Copy + Fn( + #generic_close_token = impl Fn( &mut #crate_path::host::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, - #(#kernel_func_input_tys),* - ) -> #crate_path::rustacuda::error::CudaResult<()>; + #(#cuda_kernel_param_tys),* + ); #[cfg(not(target_os = "cuda"))] #(#func_attrs)* #[allow(clippy::too_many_arguments)] #[allow(clippy::used_underscore_binding)] - pub fn #func_ident <#generic_kernel_params>( + fn #private_func_ident #generic_start_token + #generic_kernel_params + #generic_close_token ( #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, - #(#kernel_func_inputs),* - ) -> #crate_path::rustacuda::error::CudaResult<()> { - let _: #func_ident <#(#full_generics),*> = #func_ident #ty_turbofish; + #func_inputs + ) { + let _: #func_ident <#(#full_generics),*> = #private_func_ident #ty_turbofish; - #launcher.#launch::< - #(#cuda_kernel_param_tys),* - >(#(#func_params),*) + #( + let _ = #func_params; + )* } } } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs similarity index 100% rename from 
rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/args_trait.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs similarity index 98% rename from rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs rename to rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs index f68b9cf34..dc609da26 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -8,7 +8,7 @@ mod get_ptx; use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME -pub(in super::super) fn quote_cpu_linker_macro( +pub(in super::super) fn quote_host_linker_macro( crate_path: &syn::Path, KernelConfig { visibility, linker, .. 
diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs index c7a2fcabd..bf2c293cc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,4 @@ -pub mod cpu_linker_macro; -pub mod cpu_wrapper; pub mod cuda_generic_function; pub mod cuda_wrapper; +pub mod host_kernel_ty; +pub mod host_linker_macro; diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index 4486f4c49..f3e1177bc 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -13,8 +13,8 @@ use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ - cpu_linker_macro::quote_cpu_linker_macro, cpu_wrapper::quote_cpu_wrapper, cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, + host_kernel_ty::quote_host_kernel_ty, host_linker_macro::quote_host_linker_macro, }; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; @@ -213,7 +213,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { }) .collect(); - let cpu_wrapper = quote_cpu_wrapper( + let host_kernel_ty = quote_host_kernel_ty( &crate_path, &decl_generics, &impl_generics, @@ -222,8 +222,8 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func_params, &func.attrs, ); - let cpu_cuda_check = quote_generic_check(&crate_path, &func_ident); - let cpu_linker_macro = quote_cpu_linker_macro( + let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); + let host_linker_macro = quote_host_linker_macro( &crate_path, &config, &decl_generics, @@ -251,11 +251,11 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { ); (quote! 
{ - #cpu_wrapper + #host_kernel_ty - #cpu_cuda_check + #host_generic_kernel_check - #cpu_linker_macro + #host_linker_macro #cuda_wrapper #cuda_generic_function diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-derive/src/kernel/wrapper/parse.rs index 6d31697cf..8d1662772 100644 --- a/rust-cuda-derive/src/kernel/wrapper/parse.rs +++ b/rust-cuda-derive/src/kernel/wrapper/parse.rs @@ -48,7 +48,7 @@ pub(super) fn parse_kernel_fn(tokens: TokenStream) -> syn::ItemFn { } if func.sig.inputs.len() > 12 { - abort!( + emit_warning!( func.sig.inputs.span(), "Kernel function has too many arguments, {} were found but at most 12 are supported.", func.sig.inputs.len() diff --git a/src/common.rs b/src/common.rs index 5360ccbbc..deaf85220 100644 --- a/src/common.rs +++ b/src/common.rs @@ -325,7 +325,9 @@ pub trait CudaKernelParameter: sealed::Sealed { type SyncHostType; #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; + #[doc(hidden)] type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b>; #[cfg(feature = "host")] @@ -336,18 +338,21 @@ pub trait CudaKernelParameter: sealed::Sealed { inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result; + #[doc(hidden)] #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O; + #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b>; - #[cfg(not(feature = "host"))] + #[doc(hidden)] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -414,6 +419,7 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(all(not(feature = 
"host"), target_os = "cuda"))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] @@ -445,7 +451,7 @@ impl< param } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -477,6 +483,7 @@ impl< 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -519,7 +526,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -549,6 +556,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; @@ -579,7 +587,7 @@ impl< <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -622,6 +630,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -672,7 
+681,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -746,6 +755,7 @@ impl< 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -776,7 +786,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -805,6 +815,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -835,7 +846,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -861,6 +872,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = &'b mut T; type FfiType<'stream, 'b> = DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -891,7 +903,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(not(feature = 
"host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( mut param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -919,6 +931,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -952,7 +965,7 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -979,6 +992,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -1013,7 +1027,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1039,6 +1053,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'stream, 'b, >; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] type DeviceType<'b> = <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -1073,7 +1088,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a mut 
SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(not(feature = "host"))] + #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1096,7 +1111,7 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) } -#[cfg(not(feature = "host"))] +#[cfg(all(not(feature = "host"), target_os = "cuda"))] fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( diff --git a/src/host/mod.rs b/src/host/mod.rs index 539d24207..e06e180f6 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -47,10 +47,10 @@ macro_rules! impl_launcher_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -71,10 +71,10 @@ macro_rules! impl_launcher_launch { ) -> Result, ) -> Result where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { #[allow(unused_variables)] let stream = self.stream; @@ -91,10 +91,10 @@ macro_rules! impl_launcher_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -252,10 +252,10 @@ macro_rules! 
impl_typed_kernel_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { self.$with_async::<(), CudaError, $($T),*>( stream, @@ -294,10 +294,10 @@ macro_rules! impl_typed_kernel_launch { ) -> Result, ) -> Result where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { inner(self, stream, config, $($arg),*) @@ -314,10 +314,10 @@ macro_rules! impl_typed_kernel_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: Copy + FnOnce( + Kernel: /*Copy +*/ FnOnce( &mut Launcher, - $($T::SyncHostType),* - ) -> CudaResult<()>, + $($T/*::SyncHostType*/),* + )/* -> CudaResult<()>*/, { let kernel_jit_result = if config.ptx_jit { impl_typed_kernel_launch! 
{ impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { diff --git a/src/lib.rs b/src/lib.rs index 0bf8b0e21..392928d29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,8 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] +#![feature(tuple_trait)] +#![feature(unboxed_closures)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] From b2ce9ee2c83741fc3cef74f5be27579243425d99 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 23 Dec 2023 14:26:46 +0000 Subject: [PATCH 058/120] Start cleaning up the public API --- .github/workflows/ci.yml | 4 +- .github/workflows/coverage.yml | 1 - Cargo.toml | 4 +- src/common.rs | 44 +++++----- src/device/mod.rs | 75 +++++++--------- src/host/mod.rs | 130 +++++++++++----------------- src/utils/aliasing/const.rs | 3 +- src/utils/aliasing/dynamic.rs | 3 +- src/utils/aliasing/final.rs | 3 +- src/utils/box.rs | 3 +- src/utils/boxed_slice.rs | 3 +- src/utils/device_copy.rs | 2 +- src/utils/exchange/buffer/common.rs | 3 +- src/utils/option.rs | 3 +- 14 files changed, 112 insertions(+), 169 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 954395a77..07fa4ab26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack check --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive \ + --skip host,rustacuda,rustacuda_derive,regex \ --keep-going \ --target nvptx64-nvidia-cuda @@ -180,7 +180,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack clippy --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive \ + --skip host,rustacuda,rustacuda_derive,regex \ --keep-going \ --target nvptx64-nvidia-cuda \ -- -D warnings diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 904e1a65c..176d98baa 100644 --- 
a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -59,7 +59,6 @@ jobs: ./grcov . -s . --binary-path ./target/debug/deps \ -t lcov -o coverage.lcov --branch \ --keep-only "src/*" \ - --keep-only "rust-cuda-ptx-jit/*" \ --keep-only "rust-cuda-derive/*" \ --ignore-not-existing \ --excl-line GRCOV_EXCL_LINE \ diff --git a/Cargo.toml b/Cargo.toml index 0a1375547..43687be65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,8 @@ rust-version = "1.75" # nightly [features] default = [] -host = ["rustacuda", "regex"] -derive = ["rustacuda_derive", "rust-cuda-derive"] +host = ["dep:rustacuda", "dep:regex"] +derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] [dependencies] rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } diff --git a/src/common.rs b/src/common.rs index deaf85220..e4ee3f804 100644 --- a/src/common.rs +++ b/src/common.rs @@ -86,8 +86,8 @@ pub unsafe trait RustToCuda { type CudaAllocation: CudaAlloc; type CudaRepresentation: CudaAsRust + TypeGraphLayout; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -107,8 +107,8 @@ pub unsafe trait RustToCuda { CombinedCudaAlloc, )>; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -129,8 +129,8 @@ pub unsafe trait RustToCuda { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCudaAsync: RustToCuda { + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -153,8 +153,8 @@ pub unsafe trait RustToCudaAsync: RustToCuda { CombinedCudaAlloc, )>; + #[doc(hidden)] #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] /// # Errors /// 
/// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -177,8 +177,8 @@ pub unsafe trait RustToCudaAsync: RustToCuda { pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { type RustRepresentation: RustToCuda; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[doc(hidden)] + #[cfg(not(feature = "host"))] /// # Safety /// /// This is an internal function and should NEVER be called manually @@ -283,23 +283,31 @@ mod private { } pub trait EmptyCudaAlloc: private::empty::Sealed {} -impl EmptyCudaAlloc for T {} pub trait CudaAlloc: crate_private::alloc::Sealed {} -impl CudaAlloc for T {} +impl CudaAlloc for Option {} impl crate_private::alloc::Sealed for Option {} pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} impl crate_private::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} impl private::empty::Sealed for NoCudaAlloc {} pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} impl crate_private::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} impl !private::empty::Sealed for SomeCudaAlloc {} pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} impl crate_private::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for CombinedCudaAlloc +{ +} impl private::empty::Sealed for CombinedCudaAlloc { @@ -791,10 +799,7 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // The type contains no allocations and is safe to copy - let param = unsafe { CudaAsRust::as_rust(param.as_ref()) }; - - inner(param) + unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } } } impl< @@ -851,11 +856,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // Safety: param must never be dropped as we do NOT 
own any of the - // heap memory it might reference - let param = core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_ref()) }); - - inner(¶m) + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed @@ -905,15 +906,10 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(all(not(feature = "host"), target_os = "cuda"))] fn with_ffi_as_device( - mut param: Self::FfiType<'static, 'static>, + param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - // Safety: param must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut param = - core::mem::ManuallyDrop::new(unsafe { CudaAsRust::as_rust(param.as_mut()) }); - - inner(&mut param) + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed diff --git a/src/device/mod.rs b/src/device/mod.rs index 93811bb04..5ce92bbbe 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,28 +1,27 @@ -use core::{ - mem::ManuallyDrop, - ops::{Deref, DerefMut}, -}; +use core::mem::ManuallyDrop; #[cfg(feature = "derive")] #[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, RustToCuda}, - safety::SafeDeviceCopy, + common::{ + CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, RustToCuda, + }, + safety::{NoSafeAliasing, SafeDeviceCopy}, }; pub mod alloc; pub mod thread; pub mod utils; -pub trait BorrowFromRust: RustToCuda { +pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the /// [`DeviceConstRef`] borrowed on the CPU using the corresponding /// 
[`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). - unsafe fn with_borrow_from_rust) -> O>( + unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O; @@ -35,7 +34,7 @@ pub trait BorrowFromRust: RustToCuda { /// Furthermore, since different GPU threads can access heap storage /// mutably inside the safe `inner` scope, there must not be any /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut) -> O>( + unsafe fn with_borrow_from_rust_mut O>( cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, ) -> O; @@ -43,10 +42,10 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + cuda_repr_mut: DeviceOwnedRef::CudaRepresentation>>, inner: F, ) -> O where @@ -54,34 +53,46 @@ pub trait BorrowFromRust: RustToCuda { ::CudaRepresentation: SafeDeviceCopy; } -impl BorrowFromRust for T { +impl BorrowFromRust for T { #[inline] - unsafe fn with_borrow_from_rust) -> O>( + unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O { - // rust_repr must never be dropped as we do NOT own any of the + // `rust_repr` must never be dropped as we do NOT own any of the // heap memory it might reference - let rust_repr = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr.as_ref())); + let rust_repr = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); inner(&rust_repr) } #[inline] - unsafe fn with_borrow_from_rust_mut) -> O>( + unsafe fn with_borrow_from_rust_mut O>( mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, ) -> O { - // rust_repr must never be dropped as we 
do NOT own any of the + // `rust_repr_mut` must never be dropped as we do NOT own any of the // heap memory it might reference - let mut rust_repr_mut = ShallowCopy::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - + let mut rust_repr_mut = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); + + // Ideally, we would only provide access to `Pin<&mut T>` s.t. + // `core::mem::replace` cannot be used. + // However, we should still be fine because + // - the shallow part of `rust_repr_mut` is a unique copy per thread, so + // replacing it affects no other thread (immediately, drop is handled below) + // - any deep parts of `rust_repr_mut` are not allowed to hand out aliasing + // mutable references, so any deep memory replacement would not affect other + // threads + // - since any deep data is allocated from the host, we *hope* that trying to + // drop it in a CUDA thread turns into a no-op inner(&mut rust_repr_mut) } #[inline] unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + mut cuda_repr_mut: DeviceOwnedRef< + DeviceAccessible<::CudaRepresentation>, + >, inner: F, ) -> O where @@ -91,27 +102,3 @@ impl BorrowFromRust for T { inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) } } - -#[repr(transparent)] -#[derive(Debug)] -pub struct ShallowCopy(ManuallyDrop); - -impl ShallowCopy { - fn new(value: T) -> Self { - Self(ManuallyDrop::new(value)) - } -} - -impl Deref for ShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for ShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs index e06e180f6..f619b4415 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -26,7 +26,7 @@ use crate::{ CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, }, - safety::SafeDeviceCopy, + safety::{NoSafeAliasing, SafeDeviceCopy}, }; 
mod ptx_jit; @@ -47,10 +47,7 @@ macro_rules! impl_launcher_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -71,10 +68,7 @@ macro_rules! impl_launcher_launch { ) -> Result, ) -> Result where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { #[allow(unused_variables)] let stream = self.stream; @@ -91,10 +85,7 @@ macro_rules! impl_launcher_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } @@ -173,12 +164,12 @@ pub struct LaunchConfig { #[doc(cfg(feature = "host"))] #[allow(clippy::module_name_repetitions)] -pub struct PtxKernel { +pub struct RawPtxKernel { module: ManuallyDrop>, function: ManuallyDrop>, } -impl PtxKernel { +impl RawPtxKernel { /// # Errors /// /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does @@ -211,7 +202,7 @@ impl PtxKernel { } } -impl Drop for PtxKernel { +impl Drop for RawPtxKernel { fn drop(&mut self) { { // Ensure that self.function is dropped before self.module as @@ -226,16 +217,11 @@ impl Drop for PtxKernel { } } -pub enum KernelJITResult<'k> { - Cached(&'k Function<'k>), - Recompiled(&'k Function<'k>), -} - pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; pub struct TypedPtxKernel { compiler: PtxJITCompiler, - ptx_kernel: Option, + ptx_kernel: Option, entry_point: Box, configure: Option>, marker: PhantomData, @@ -252,10 +238,7 @@ macro_rules! 
impl_typed_kernel_launch { $($arg: $T::SyncHostType),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { self.$with_async::<(), CudaError, $($T),*>( stream, @@ -294,10 +277,7 @@ macro_rules! impl_typed_kernel_launch { ) -> Result, ) -> Result where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { inner(self, stream, config, $($arg),*) @@ -314,22 +294,15 @@ macro_rules! impl_typed_kernel_launch { $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult<()> where - Kernel: /*Copy +*/ FnOnce( - &mut Launcher, - $($T/*::SyncHostType*/),* - )/* -> CudaResult<()>*/, + Kernel: FnOnce(&mut Launcher, $($T),*), { - let kernel_jit_result = if config.ptx_jit { + let function = if config.ptx_jit { impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { self.compile_with_ptx_jit_args(Some(&[$($arg),*])) } }? } else { self.compile_with_ptx_jit_args(None)? }; - let function = match kernel_jit_result { - KernelJITResult::Recompiled(function) - | KernelJITResult::Cached(function) => function, - }; unsafe { stream.launch( function, @@ -362,6 +335,22 @@ macro_rules! impl_typed_kernel_launch { }; } +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); + + Self { + compiler, + ptx_kernel: None, + entry_point, + configure, + marker: PhantomData::, + } + } +} + impl TypedPtxKernel { impl_typed_kernel_launch! 
{ launch0() => with0_async => launch0_async } @@ -415,20 +404,6 @@ impl TypedPtxKernel { arg11: K, arg12: L ) => with12_async => launch12_async } - #[must_use] - pub fn new>(configure: Option>) -> Self { - let compiler = PtxJITCompiler::new(T::get_ptx()); - let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - - Self { - compiler, - ptx_kernel: None, - entry_point, - configure, - marker: PhantomData::, - } - } - /// # Errors /// /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to @@ -437,15 +412,13 @@ impl TypedPtxKernel { fn compile_with_ptx_jit_args( &mut self, arguments: Option<&[Option<&NonNull<[u8]>>]>, - ) -> CudaResult { + ) -> CudaResult<&Function> { let ptx_jit = self.compiler.with_arguments(arguments); let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { - (Some(ptx_kernel), PtxJITResult::Cached(_)) => { - KernelJITResult::Cached(ptx_kernel.get_function()) - }, + (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_ptx_kernel = PtxKernel::new(ptx_cstr, &self.entry_point)?; + let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; // Replace the existing compiled kernel, drop the old one let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); @@ -456,7 +429,7 @@ impl TypedPtxKernel { configure(function)?; } - KernelJITResult::Recompiled(function) + function }, }; @@ -464,7 +437,7 @@ impl TypedPtxKernel { } } -pub trait LendToCuda: RustToCuda { +pub trait LendToCuda: RustToCuda + NoSafeAliasing { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure @@ -525,12 +498,10 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc; + Self: RustToCuda; } -impl LendToCuda for T { 
+impl LendToCuda for T { fn lend_to_cuda< O, E: From, @@ -583,9 +554,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - ::CudaAllocation: EmptyCudaAlloc, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -597,23 +566,21 @@ impl LendToCuda for T { } } -mod private { - pub mod drop { - pub trait Sealed: Sized { - fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; - } - } +pub trait CudaDroppable: Sized { + #[allow(clippy::missing_errors_doc)] + fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; } #[repr(transparent)] -pub struct CudaDropWrapper(ManuallyDrop); -impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} -impl From for CudaDropWrapper { +pub struct CudaDropWrapper(ManuallyDrop); +impl crate::common::CudaAlloc for CudaDropWrapper {} +impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} +impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) } } -impl Drop for CudaDropWrapper { +impl Drop for CudaDropWrapper { fn drop(&mut self) { // Safety: drop is only ever called once let val = unsafe { ManuallyDrop::take(&mut self.0) }; @@ -623,14 +590,14 @@ impl Drop for CudaDropWrapper { } } } -impl Deref for CudaDropWrapper { +impl Deref for CudaDropWrapper { type Target = C; fn deref(&self) -> &Self::Target { &self.0 } } -impl DerefMut for CudaDropWrapper { +impl DerefMut for CudaDropWrapper { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } @@ -638,7 +605,7 @@ impl DerefMut for CudaDropWrapper { macro_rules! impl_sealed_drop_collection { ($type:ident) => { - impl private::drop::Sealed for $type { + impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) } @@ -653,7 +620,7 @@ impl_sealed_drop_collection!(LockedBox); macro_rules! 
impl_sealed_drop_value { ($type:ident) => { - impl private::drop::Sealed for $type { + impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) } @@ -727,6 +694,7 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); +impl crate::common::CudaAlloc for HostDeviceBox {} impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 91496a47d..c40b3642a 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -248,8 +248,7 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index d7b48b05f..c70ca80f8 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -226,8 +226,7 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 019ece1b6..366de9557 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -83,8 +83,7 @@ unsafe impl RustToCudaAsync for Final { unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = 
"host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { Final::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/box.rs b/src/utils/box.rs index 8e81941a1..ab0b22708 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -79,8 +79,7 @@ unsafe impl RustToCuda for Box { unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(this.0) } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 4a06e0a8d..588fa8c07 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -81,8 +81,7 @@ unsafe impl RustToCuda for Box<[T]> { unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { type RustRepresentation = Box<[T]>; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 2869cd296..b06735692 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -133,7 +133,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; - #[cfg(any(not(feature = "host"), doc))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { let mut uninit = core::mem::MaybeUninit::uninit(); core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index c5d1f9128..12a491b20 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -28,8 +28,7 @@ unsafe impl; - 
#[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw( core::slice::from_raw_parts_mut(this.0, this.1), diff --git a/src/utils/option.rs b/src/utils/option.rs index f939f5ba0..a7b3e991e 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -146,8 +146,7 @@ unsafe impl RustToCudaAsync for Option { unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = Option<::RustRepresentation>; - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(not(feature = "host"))] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { if this.present { Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) From ed082f24ee1971772a1ff20c6024c7ef689ba995 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 05:21:39 +0000 Subject: [PATCH 059/120] Allow passing ThreadBlockShared to kernels again --- examples/single-source/src/main.rs | 19 +++----- src/common.rs | 70 +++++++++++++++++++++++++++++- src/device/thread.rs | 10 ++--- src/device/utils.rs | 2 + src/host/mod.rs | 21 ++++----- src/lib.rs | 8 +++- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/device_copy.rs | 17 ++++++-- src/utils/exchange/buffer/mod.rs | 12 ++--- src/utils/exchange/wrapper.rs | 16 +++---- src/utils/shared/slice.rs | 4 +- src/utils/shared/static.rs | 4 +- 13 files changed, 137 insertions(+), 54 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index f53963f9d..d88d628a7 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -15,9 +15,6 @@ extern crate alloc; -#[cfg(target_os = "cuda")] -use rc::utils::shared::r#static::ThreadBlockShared; - #[cfg(not(target_os = "cuda"))] fn main() {} @@ 
-69,12 +66,11 @@ pub fn kernel< _z: &rc::common::SharedHeapPerThreadShallowCopy>, _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, - Tuple(s, mut __t): rc::common::PerThreadShallowCopy, - q: rc::common::PerThreadShallowCopy, - // shared3: ThreadBlockShared, + q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, + shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, ) { - let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); - let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { @@ -84,10 +80,9 @@ pub fn kernel< (*shared2.index_mut_unchecked(2)).1 = q.0 + q.1 + q.2; } - // unsafe { core::arch::asm!("hi") } - // unsafe { - // *shared3.as_mut_ptr() = 12; - // } + unsafe { + *shared3.as_mut_ptr() = 12; + } } #[cfg(not(target_os = "cuda"))] diff --git a/src/common.rs b/src/common.rs index e4ee3f804..b0263031b 100644 --- a/src/common.rs +++ b/src/common.rs @@ -313,7 +313,8 @@ impl private::empt { } impl CombinedCudaAlloc { - pub fn new(front: A, tail: B) -> Self { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { Self(front, tail) } @@ -1117,3 +1118,70 @@ fn emit_param_ptx_jit_marker(param: &T) { ); } } + +mod private_shared { + use const_type_layout::TypeGraphLayout; + use rustacuda_core::DeviceCopy; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _marker: [T; 0], + } + + // Safety: there is nothing to copy, this is just a zero-sized marker type + unsafe impl DeviceCopy for ThreadBlockSharedFfi {} +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::r#static::ThreadBlockShared +{ + 
#[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + _param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedFfi { _marker: [] } + } + + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[allow(clippy::inline_always)] + #[inline(always)] + fn with_ffi_as_device( + _param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let mut param = crate::utils::shared::r#static::ThreadBlockShared::new_uninit(); + + inner(&mut param) + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::r#static::ThreadBlockShared +{ +} diff --git a/src/device/thread.rs b/src/device/thread.rs index 26ee357d2..b2f3035bd 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -20,7 +20,7 @@ impl Thread { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn this() -> Self { + pub const fn this() -> Self { Self { _private: () } } @@ -54,7 +54,7 @@ impl Thread { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn block(&self) -> ThreadBlock { + pub const fn block(&self) -> 
ThreadBlock { ThreadBlock { _private: () } } } @@ -91,7 +91,7 @@ impl ThreadBlock { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn grid(&self) -> ThreadBlockGrid { + pub const fn grid(&self) -> ThreadBlockGrid { ThreadBlockGrid { _private: () } } @@ -138,7 +138,7 @@ impl Dim3 { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn size(&self) -> usize { + pub const fn size(&self) -> usize { (self.x as usize) * (self.y as usize) * (self.z as usize) } } @@ -147,7 +147,7 @@ impl Idx3 { #[must_use] #[allow(clippy::inline_always)] #[inline(always)] - pub fn as_id(&self, dim: &Dim3) -> usize { + pub const fn as_id(&self, dim: &Dim3) -> usize { (self.x as usize) + (self.y as usize) * (dim.x as usize) + (self.z as usize) * (dim.x as usize) * (dim.y as usize) diff --git a/src/device/utils.rs b/src/device/utils.rs index 073e7bd54..3b37307a6 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -48,6 +48,7 @@ pub fn print(args: ::core::fmt::Arguments) { } let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] let msg = if let Some(msg) = args.as_str() { msg } else { @@ -87,6 +88,7 @@ pub fn pretty_panic_handler( } let msg; // place to store the dynamically expanded format string + #[allow(clippy::option_if_let_else)] let msg = if let Some(message) = info.message() { if let Some(msg) = message.as_str() { msg diff --git a/src/host/mod.rs b/src/host/mod.rs index f619b4415..685584659 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -154,7 +154,7 @@ impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { ) => with12_async => launch12_async } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct LaunchConfig { pub grid: rustacuda::function::GridSize, pub block: rustacuda::function::BlockSize, @@ -676,7 +676,7 @@ impl From> for LockedBox { fn from(host_locked_box: HostLockedBox) -> Self { // Safety: pointer comes from 
[`LockedBox::into_raw`] // i.e. this function completes the roundtrip - unsafe { LockedBox::from_raw(host_locked_box.0) } + unsafe { Self::from_raw(host_locked_box.0) } } } @@ -790,7 +790,7 @@ impl From> for DeviceBox { fn from(host_device_box: HostDeviceBox) -> Self { // Safety: pointer comes from [`DeviceBox::into_device`] // i.e. this function completes the roundtrip - unsafe { DeviceBox::from_device(host_device_box.0) } + unsafe { Self::from_device(host_device_box.0) } } } @@ -918,7 +918,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { + pub const unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { Self { device_box, host_ref, @@ -962,12 +962,12 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[must_use] - pub fn for_host(&'a self) -> &'a T { + pub const fn for_host(&'a self) -> &'a T { self.host_ref } #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRef<'b, T> where 'a: 'b, { @@ -975,7 +975,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[must_use] - pub fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> where 'a: 'b, { @@ -1124,7 +1124,8 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( + #[must_use] + pub const unsafe fn new( device_box: &'a HostDeviceBox, host_ref: &'a T, stream: &'stream Stream, @@ -1154,12 +1155,12 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[must_use] - pub fn for_host(&'a self) -> &'a T { + pub const fn for_host(&'a self) -> &'a T { 
self.host_ref } #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> where 'a: 'b, { diff --git a/src/lib.rs b/src/lib.rs index 392928d29..26f56adb6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,10 @@ -#![deny(clippy::pedantic)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] #![cfg_attr(not(feature = "host"), no_std)] #![feature(associated_type_bounds)] diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index c40b3642a..759b14dc9 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -14,7 +14,7 @@ pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { #[must_use] - pub fn new(inner: T) -> Self { + pub const fn new(inner: T) -> Self { Self(inner) } } @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsConstStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. /// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn alias_unchecked(&self) -> &T { + pub const unsafe fn alias_unchecked(&self) -> &T { &self.0 } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c70ca80f8..a3cecfa8f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -17,7 +17,7 @@ pub struct SplitSliceOverCudaThreadsDynamicStride { impl SplitSliceOverCudaThreadsDynamicStride { #[must_use] - pub fn new(inner: T, stride: usize) -> Self { + pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } } } @@ -49,7 +49,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { /// All cross-CUDA-thread aliasing guarantees are lost with this method. 
/// Instead, the caller must ensure that no two threads in a kernel launch /// access the same underlying elements. - pub unsafe fn alias_unchecked(&self) -> &T { + pub const unsafe fn alias_unchecked(&self) -> &T { &self.inner } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index b06735692..0a92e69a1 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -28,45 +28,54 @@ impl From for SafeDeviceCopyWrapper { } impl SafeDeviceCopyWrapper { + #[must_use] pub fn into_inner(self) -> T { self.0 } - pub fn from_ref(reference: &T) -> &Self { + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(reference as *const T).cast() } } - pub fn into_ref(&self) -> &T { + #[must_use] + pub const fn into_ref(&self) -> &T { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &*(self as *const Self).cast() } } + #[must_use] pub fn from_mut(reference: &mut T) -> &mut Self { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(reference as *mut T).cast() } } + #[must_use] pub fn into_mut(&mut self) -> &mut T { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { &mut *(self as *mut Self).cast() } } - pub fn from_slice(slice: &[T]) -> &[Self] { + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } - pub fn into_slice(slice: &[Self]) -> &[T] { + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } } + #[must_use] pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around 
[`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } } + #[must_use] pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c4e4b24bd..66b2144c1 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -32,7 +32,7 @@ unsafe impl rustacuda_core: impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn read(&self) -> &T { + pub const fn read(&self) -> &T { &self.0 } @@ -46,7 +46,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn read(&self) -> &T { + pub const fn read(&self) -> &T { &self.0 } @@ -66,7 +66,7 @@ impl AsMut for CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn as_scratch(&self) -> &T { + pub const fn as_scratch(&self) -> &T { &self.0 } @@ -80,7 +80,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn as_scratch(&self) -> &T { + pub const fn as_scratch(&self) -> &T { &self.0 } @@ -94,7 +94,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(feature = "host", doc))] #[doc(cfg(feature = "host"))] - pub fn as_uninit(&self) -> &MaybeUninit { + pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype @@ -114,7 +114,7 @@ impl CudaExchangeItem { impl CudaExchangeItem { #[cfg(any(not(feature = "host"), doc))] #[doc(cfg(not(feature = "host")))] - pub fn as_uninit(&self) -> &MaybeUninit { + pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a 
transparent newtype union // - CudaExchangeItem is a transparent newtype diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 4ca2474d4..4edfdebd8 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -224,13 +224,13 @@ impl<'stream, T: RustToCuda> IntoFuture core::future::poll_fn(move |cx| match &wrapper { Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => match inner.waker.lock() { - Ok(mut w) => { + Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut w| { *w = Some(cx.waker().clone()); Poll::Pending }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), - }, + ), Ok(EventStatus::Ready) => match wrapper.take() { Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { value: inner.value, @@ -419,13 +419,13 @@ impl<'stream, T: RustToCuda> IntoFuture core::future::poll_fn(move |cx| match &wrapper { Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => match inner.waker.lock() { - Ok(mut w) => { + Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut w| { *w = Some(cx.waker().clone()); Poll::Pending }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), - }, + ), Ok(EventStatus::Ready) => match wrapper.take() { Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { value: inner.value, diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 804623ae4..920f0c58d 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -63,14 +63,14 @@ impl ThreadBlockSharedSlice { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { + pub const fn as_mut_ptr(&self) -> *mut T { self.shared.cast() } #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_slice_ptr(&self) -> *mut 
[T] { + pub const fn as_mut_slice_ptr(&self) -> *mut [T] { self.shared } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 0ba7f9df0..41ba334ba 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -9,6 +9,8 @@ pub struct ThreadBlockShared { impl ThreadBlockShared { #[must_use] + #[allow(clippy::inline_always, clippy::missing_const_for_fn)] + #[inline(always)] pub fn new_uninit() -> Self { #[cfg(not(target_os = "cuda"))] { @@ -38,7 +40,7 @@ impl ThreadBlockShared { #[cfg(any(target_os = "cuda", doc))] #[doc(cfg(target_os = "cuda"))] #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { + pub const fn as_mut_ptr(&self) -> *mut T { self.shared } } From ea74fa2edf0b7cd7745734776886690e2ec2221a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 05:28:22 +0000 Subject: [PATCH 060/120] Remove unsound mutable lending to CUDA for now --- examples/single-source/src/main.rs | 1 - src/common.rs | 115 ----------------------------- src/device/mod.rs | 50 ++++--------- src/host/mod.rs | 46 ------------ 4 files changed, 13 insertions(+), 199 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index d88d628a7..1b677192d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -62,7 +62,6 @@ pub fn kernel< + rc::safety::NoSafeAliasing, >( _x: &rc::common::PerThreadShallowCopy, - _y: &mut rc::common::PtxJit>>, _z: &rc::common::SharedHeapPerThreadShallowCopy>, _v @ _w: &'a rc::common::ShallowInteriorMutable, _: rc::common::SharedHeapPerThreadShallowCopy>, diff --git a/src/common.rs b/src/common.rs index b0263031b..2e8d72a7c 100644 --- a/src/common.rs +++ b/src/common.rs @@ -865,59 +865,6 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a mut SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type 
AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceMutRefAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - type DeviceType<'b> = &'b mut T; - type FfiType<'stream, 'b> = - DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::lend_to_cuda_mut(param, |mut param| inner(param.as_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - mut param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a mut SharedHeapPerThreadShallowCopy -{ -} - impl< T: RustToCuda< CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, @@ -1041,68 +988,6 @@ impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a mut PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType< - 'stream, - 'b, - >; - #[cfg(all(not(feature = "host"), target_os = 
"cuda"))] - type DeviceType<'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a mut SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::< - O, - PARAM, - >(param, inner) - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a mut PtxJit> -{ -} - #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) diff --git a/src/device/mod.rs b/src/device/mod.rs index 5ce92bbbe..c4a459087 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -5,9 +5,7 @@ use core::mem::ManuallyDrop; pub use 
rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ - common::{ - CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, RustToCuda, - }, + common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceOwnedRef, RustToCuda}, safety::{NoSafeAliasing, SafeDeviceCopy}, }; @@ -26,18 +24,18 @@ pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { inner: F, ) -> O; - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr_mut` is the - /// [`DeviceMutRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). - /// Furthermore, since different GPU threads can access heap storage - /// mutably inside the safe `inner` scope, there must not be any - /// aliasing between concurrently running threads. - unsafe fn with_borrow_from_rust_mut O>( - cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O; + // /// # Safety + // /// + // /// This function is only safe to call iff `cuda_repr_mut` is the + // /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + // /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). + // /// Furthermore, since different GPU threads can access heap storage + // /// mutably inside the safe `inner` scope, there must not be any + // /// aliasing between concurrently running threads. 
+ // unsafe fn with_borrow_from_rust_mut O>( + // cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, + // ) -> O; /// # Safety /// @@ -66,28 +64,6 @@ impl BorrowFromRust for T { inner(&rust_repr) } - #[inline] - unsafe fn with_borrow_from_rust_mut O>( - mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, - inner: F, - ) -> O { - // `rust_repr_mut` must never be dropped as we do NOT own any of the - // heap memory it might reference - let mut rust_repr_mut = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); - - // Ideally, we would only provide access to `Pin<&mut T>` s.t. - // `core::mem::replace` cannot be used. - // However, we should still be fine because - // - the shallow part of `rust_repr_mut` is a unique copy per thread, so - // replacing it affects no other thread (immediately, drop is handled below) - // - any deep parts of `rust_repr_mut` are not allowed to hand out aliasing - // mutable references, so any deep memory replacement would not affect other - // threads - // - since any deep data is allocated from the host, we *hope* that trying to - // drop it in a CUDA thread turns into a no-op - inner(&mut rust_repr_mut) - } - #[inline] unsafe fn with_moved_from_rust O>( mut cuda_repr_mut: DeviceOwnedRef< diff --git a/src/host/mod.rs b/src/host/mod.rs index 685584659..424b5726c 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -457,31 +457,6 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result; - /// Lends a mutable copy of `&mut self` to CUDA: - /// - code in the CUDA kernel can only access `&mut self` through the - /// [`DeviceMutRef`] inside the closure - /// - after the closure, `&mut self` might have changed in the following - /// ways: - /// - to avoid aliasing, each CUDA thread gets its own shallow copy of - /// `&mut self`, i.e. any shallow changes will NOT be reflected after - /// the closure - /// - each CUDA thread can access the same heap allocated storage, i.e. 
- /// any deep changes will be reflected after the closure - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result; - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] /// /// # Errors @@ -522,27 +497,6 @@ impl LendToCuda for T { result } - fn lend_to_cuda_mut< - O, - E: From, - F: FnOnce( - HostAndDeviceMutRef::CudaRepresentation>>, - ) -> Result, - >( - &mut self, - inner: F, - ) -> Result { - let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); - - core::mem::drop(cuda_repr); - - let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; - - result - } - fn move_to_cuda< O, E: From, From 000a3f60cf48ab8b46c5ada22c7f3040c68b851f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 24 Dec 2023 08:46:31 +0000 Subject: [PATCH 061/120] Allow passing ThreadBlockSharedSlice to kernel for dynamic shared memory --- .github/workflows/rustdoc.yml | 2 +- Cargo.toml | 2 +- examples/single-source/src/main.rs | 8 ++ .../kernel/wrapper/generate/cuda_wrapper.rs | 33 +++-- src/common.rs | 114 +++++++++++++++--- src/utils/shared/slice.rs | 57 +++++++++ 6 files changed, 183 insertions(+), 33 deletions(-) diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml index 23f4f1c07..5c756572c 100644 --- a/.github/workflows/rustdoc.yml +++ b/.github/workflows/rustdoc.yml @@ -28,7 +28,7 @@ jobs: run: | RUSTDOCFLAGS="\ --enable-index-page \ - --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.0/ \ + --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.2.1/ \ --extern-html-root-url final=https://docs.rs/final/0.1.1/ \ --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \ --extern-html-root-url 
rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \ diff --git a/Cargo.toml b/Cargo.toml index 43687be65..a218c629d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc regex = { version = "1.10", optional = true } -const-type-layout = { version = "0.2.0", features = ["derive"] } +const-type-layout = { version = "0.2.1", features = ["derive"] } final = "0.1.1" diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 1b677192d..b57556963 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -67,6 +67,7 @@ pub fn kernel< _: rc::common::SharedHeapPerThreadShallowCopy>, q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, + dynamic: &mut rc::utils::shared::slice::ThreadBlockSharedSlice, ) { let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); @@ -82,6 +83,13 @@ pub fn kernel< unsafe { *shared3.as_mut_ptr() = 12; } + + let index = rc::device::thread::Thread::this().index(); + if index < dynamic.len() { + unsafe { + *dynamic.index_mut_unchecked(index) = Dummy(42); + } + } } #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 7fce0a925..1bb8a5577 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -35,16 +35,11 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); - let ffi_param_ptx_jit_wrap = func_inputs - .iter().enumerate() - .rev() - .fold(quote! { + let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( + quote! 
{ #func_ident(#(#func_params),*) - }, |inner, (i, syn::PatType { - pat, - ty, - .. - })| { + }, + |inner, (i, syn::PatType { pat, ty, .. })| { let specialised_ty = quote::quote_spanned! { ty.span()=> #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) }; @@ -53,18 +48,30 @@ pub(in super::super) fn quote_cuda_wrapper( // To allow some parameters to also inject PTX JIT load markers here, // we pass them the param index i quote::quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::with_ffi_as_device::<_, #i>( - #pat, |#pat| { #inner } - ) + unsafe { + < + #specialised_ty as #crate_path::common::CudaKernelParameter + >::with_ffi_as_device::<_, #i>( + #pat, |#pat| { #inner } + ) + } } - }); + }, + ); quote! { #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] #[no_mangle] + #[allow(unused_unsafe)] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + unsafe { + // Initialise the dynamically-sized thread-block shared memory + // and the thread-local offset pointer that points to it + #crate_path::utils::shared::slice::init(); + } + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } diff --git a/src/common.rs b/src/common.rs index 2e8d72a7c..7f8f6cfec 100644 --- a/src/common.rs +++ b/src/common.rs @@ -362,7 +362,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[doc(hidden)] #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O; @@ -461,7 +461,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -536,7 +536,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn 
with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -597,7 +597,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -691,7 +691,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -726,11 +726,20 @@ impl_atomic_interior_mutable! { AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) } -// TODO: update const type layout -// impl -// InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell {} -// impl sealed::Sealed for -// core::cell::SyncUnsafeCell {} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +{ +} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for core::cell::SyncUnsafeCell +{ +} pub struct SharedHeapPerThreadShallowCopy { never: !, @@ -796,7 +805,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -853,7 +862,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -910,7 
+919,7 @@ impl< } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -972,7 +981,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara } #[cfg(all(not(feature = "host"), target_os = "cuda"))] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -997,9 +1006,9 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( - "// //", - in(reg32) *(core::ptr::from_ref(param).cast::()), - const(INDEX), + "// //", + param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), + param_index = const(INDEX), ); } } @@ -1017,6 +1026,17 @@ mod private_shared { // Safety: there is nothing to copy, this is just a zero-sized marker type unsafe impl DeviceCopy for ThreadBlockSharedFfi {} + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } + + // Safety: we only copy a usize, which implements `DeviceCopy` + unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} } impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter @@ -1057,7 +1077,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter #[cfg(all(not(feature = "host"), target_os = "cuda"))] #[allow(clippy::inline_always)] #[inline(always)] - fn with_ffi_as_device( + unsafe fn with_ffi_as_device( _param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { @@ -1070,3 +1090,61 @@ impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed for &'a mut crate::utils::shared::r#static::ThreadBlockShared { } + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut 
crate::utils::shared::slice::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + } + } + + #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { + crate::utils::shared::slice::ThreadBlockSharedSlice::with_uninit_for_len( + param.len, inner, + ) + } + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice +{ +} diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 920f0c58d..7039e15f9 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -88,3 +88,60 @@ impl ThreadBlockSharedSlice { self.shared.get_unchecked_mut(index) } } + +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[doc(cfg(all(not(feature = "host"), target_os = 
"cuda")))] +impl ThreadBlockSharedSlice { + /// # Safety + /// + /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one + /// call to [`init`]. + pub(crate) unsafe fn with_uninit_for_len Q, Q>( + len: usize, + inner: F, + ) -> Q { + let base: *mut u8; + + unsafe { + core::arch::asm!( + "mov.u64 {base}, %rust_cuda_dynamic_shared;", + base = out(reg64) base, + ); + } + + let aligned_base = base.byte_add(base.align_offset(core::mem::align_of::())); + + let data: *mut T = aligned_base.cast(); + + let new_base = data.add(len).cast::(); + + unsafe { + core::arch::asm!( + "mov.u64 %rust_cuda_dynamic_shared, {new_base};", + new_base = in(reg64) new_base, + ); + } + + let shared = core::ptr::slice_from_raw_parts_mut(data, len); + + inner(&mut Self { shared }) + } +} + +#[doc(hidden)] +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +/// # Safety +/// +/// The thread-block shared dynamic memory must be initialised once and +/// only once per kernel. +pub unsafe fn init() { + unsafe { + core::arch::asm!(".reg .u64 %rust_cuda_dynamic_shared;"); + core::arch::asm!( + "cvta.shared.u64 %rust_cuda_dynamic_shared, rust_cuda_dynamic_shared_base;", + ); + } +} + +#[cfg(all(not(feature = "host"), target_os = "cuda"))] +core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); From cd8f4b4da519ddaebfa736673632735edf59e434 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 09:52:56 +0000 Subject: [PATCH 062/120] Begin refactoring the public API with device feature --- .github/workflows/ci.yml | 2 + Cargo.toml | 1 + examples/print/Cargo.toml | 2 +- examples/print/src/main.rs | 29 +-- examples/single-source/Cargo.toml | 2 +- examples/single-source/src/main.rs | 14 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 18 +- src/common.rs | 79 ++++---- src/deps.rs | 6 + src/device/alloc.rs | 2 +- src/device/mod.rs | 1 - src/device/thread.rs | 2 +- src/host/mod.rs | 3 - 
src/host/ptx_jit/mod.rs | 2 - src/lib.rs | 49 ++--- src/utils/aliasing/const.rs | 47 ++--- src/utils/aliasing/dynamic.rs | 50 ++--- src/utils/aliasing/final.rs | 7 +- src/utils/box.rs | 20 +- src/utils/boxed_slice.rs | 20 +- src/utils/device_copy.rs | 11 +- src/utils/exchange/buffer/common.rs | 12 +- src/utils/exchange/buffer/device.rs | 40 +--- src/utils/exchange/buffer/host.rs | 42 ++--- src/utils/exchange/buffer/mod.rs | 178 ++++++++++++++---- src/utils/exchange/mod.rs | 1 - src/utils/option.rs | 16 +- src/utils/shared/slice.rs | 48 ++--- src/utils/shared/static.rs | 18 +- 30 files changed, 386 insertions(+), 340 deletions(-) create mode 100644 src/deps.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07fa4ab26..a8f37a6dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,6 +57,7 @@ jobs: - name: Check feature powerset on the CPU run: | cargo hack check --feature-powerset --optional-deps \ + --skip device \ --keep-going - name: Check feature powerset on CUDA @@ -174,6 +175,7 @@ jobs: - name: Check feature powerset on the CPU run: | cargo hack clippy --feature-powerset --optional-deps \ + --skip device \ --keep-going \ -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index a218c629d..12a90ef59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ rust-version = "1.75" # nightly [features] default = [] host = ["dep:rustacuda", "dep:regex"] +device = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] [dependencies] diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml index 21f513d8f..05f3a537e 100644 --- a/examples/print/Cargo.toml +++ b/examples/print/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive"] } +rust-cuda = { path = "../../", features = ["derive", "device"] } [target.'cfg(not(target_os 
= "cuda"))'.dependencies] rust-cuda = { path = "../../", features = ["derive", "host"] } diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 62a0e2713..462603ca6 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -12,8 +12,8 @@ extern crate alloc; -#[derive(rust_cuda::const_type_layout::TypeLayout)] -#[layout(crate = "rust_cuda::const_type_layout")] +#[derive(rust_cuda::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rust_cuda::deps::const_type_layout")] #[repr(C)] pub enum Action { Print, @@ -34,37 +34,38 @@ pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { } #[cfg(not(target_os = "cuda"))] -fn main() -> rust_cuda::rustacuda::error::CudaResult<()> { +fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { // Link the non-generic CUDA kernel struct KernelPtx; link! { impl kernel for KernelPtx } // Initialize the CUDA API - rust_cuda::rustacuda::init(rust_cuda::rustacuda::CudaFlags::empty())?; + rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?; // Get the first CUDA GPU device - let device = rust_cuda::rustacuda::device::Device::get_device(0)?; + let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?; // Create a CUDA context associated to this device let _context = rust_cuda::host::CudaDropWrapper::from( - rust_cuda::rustacuda::context::Context::create_and_push( - rust_cuda::rustacuda::context::ContextFlags::MAP_HOST - | rust_cuda::rustacuda::context::ContextFlags::SCHED_AUTO, + rust_cuda::deps::rustacuda::context::Context::create_and_push( + rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST + | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO, device, )?, ); // Create a new CUDA stream to submit kernels to - let stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::rustacuda::stream::Stream::new( - rust_cuda::rustacuda::stream::StreamFlags::NON_BLOCKING, - None, - )?); + let stream = + 
rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new( + rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING, + None, + )?); // Create a new instance of the CUDA kernel and prepare the launch config let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); let config = rust_cuda::host::LaunchConfig { - grid: rust_cuda::rustacuda::function::GridSize::x(1), - block: rust_cuda::rustacuda::function::BlockSize::x(4), + grid: rust_cuda::deps::rustacuda::function::GridSize::x(1), + block: rust_cuda::deps::rustacuda::function::BlockSize::x(4), shared_memory_size: 0, ptx_jit: false, }; diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index eeada181d..6f53359cd 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b57556963..41df1705d 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -19,8 +19,8 @@ extern crate alloc; fn main() {} #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] +#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); #[derive(rc::common::LendRustToCuda)] @@ -36,13 +36,13 @@ pub struct Wrapper { pub struct Empty([u8; 0]); #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] 
+#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Tuple(u32, i32); #[repr(C)] -#[derive(rc::const_type_layout::TypeLayout)] -#[layout(crate = "rc::const_type_layout")] +#[derive(rc::deps::const_type_layout::TypeLayout)] +#[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); #[rc::common::kernel(pub use link! for impl)] @@ -94,8 +94,6 @@ pub fn kernel< #[cfg(not(target_os = "cuda"))] mod host { - // use super::{link, kernel}; - // Link several instances of the generic CUDA kernel struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index 1bb8a5577..e87bd0d16 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -78,8 +78,8 @@ pub(in super::super) fn quote_cuda_wrapper( #( #[no_mangle] static #func_layout_params: [ - u8; #crate_path::const_type_layout::serialised_type_graph_len::<#ffi_types>() - ] = #crate_path::const_type_layout::serialise_type_graph::<#ffi_types>(); + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_types>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_types>(); unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; )* diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 896e51e89..2928cebef 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -29,20 +29,20 @@ pub fn cuda_struct_declaration( quote!(#where_clause #struct_fields_cuda) }; - let const_type_layout_crate_path = quote! { #crate_path::const_type_layout }.to_string(); + let const_type_layout_crate_path = quote! 
{ #crate_path::deps::const_type_layout }.to_string(); quote! { #[allow(dead_code)] #[doc(hidden)] #(#struct_attrs_cuda)* - #[derive(#crate_path::const_type_layout::TypeLayout)] + #[derive(#crate_path::deps::const_type_layout::TypeLayout)] #struct_repr #(#struct_layout_attrs)* #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause // #[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics #crate_path::rustacuda_core::DeviceCopy + unsafe impl #impl_generics #crate_path::deps::rustacuda_core::DeviceCopy for #struct_name_cuda #ty_generics #where_clause {} } } @@ -87,7 +87,7 @@ pub fn rust_to_cuda_trait( unsafe fn borrow( &self, alloc: CudaAllocType, - ) -> #crate_path::rustacuda::error::CudaResult<( + ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { @@ -110,7 +110,7 @@ pub fn rust_to_cuda_trait( alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - ) -> #crate_path::rustacuda::error::CudaResult { + ) -> #crate_path::deps::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_destructors)* @@ -156,8 +156,8 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async( &self, alloc: CudaAllocType, - stream: &#crate_path::rustacuda::stream::Stream, - ) -> #crate_path::rustacuda::error::CudaResult<( + stream: &#crate_path::deps::rustacuda::stream::Stream, + ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::common::DeviceAccessible, #crate_path::common::CombinedCudaAlloc )> { @@ -180,8 +180,8 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::common::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, - stream: &#crate_path::rustacuda::stream::Stream, - ) -> #crate_path::rustacuda::error::CudaResult { + stream: &#crate_path::deps::rustacuda::stream::Stream, + ) -> 
#crate_path::deps::rustacuda::error::CudaResult { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* diff --git a/src/common.rs b/src/common.rs index 7f8f6cfec..d9d1a955a 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,4 +1,4 @@ -#[cfg(any(not(feature = "host"), doc))] +#[cfg(feature = "device")] use core::convert::{AsMut, AsRef}; use core::{ marker::PhantomData, @@ -13,22 +13,20 @@ use core::{ ptr::{copy_nonoverlapping, NonNull}, }; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::LendRustToCuda; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::kernel; #[cfg(feature = "host")] use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; #[repr(transparent)] -#[cfg_attr(not(feature = "host"), derive(Debug))] +#[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] pub struct DeviceAccessible(T); @@ -54,7 +52,7 @@ impl From<&T> for DeviceAccessible fmt::Debug for DeviceAccessible { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) @@ -62,7 +60,7 @@ impl fmt::Debug for DeviceAccessible { } } -#[cfg(not(feature = "host"))] +#[cfg(feature = "device")] impl Deref for DeviceAccessible { type Target = T; @@ -71,7 +69,7 @@ impl Deref for DeviceAccessible { } } -#[cfg(not(feature = "host"))] +#[cfg(feature = "device")] impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 @@ -178,7 +176,7 @@ pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { type RustRepresentation: RustToCuda; #[doc(hidden)] - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] /// # Safety /// /// This is an internal function and should NEVER be called manually @@ -209,8 +207,7 @@ pub struct DeviceConstRef<'r, T: 
DeviceCopy + 'r> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } @@ -227,16 +224,14 @@ pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } } -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } @@ -254,16 +249,14 @@ pub struct DeviceOwnedRef<'r, T: DeviceCopy> { unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { unsafe { &*self.pointer } } } -#[cfg(any(not(feature = "host"), doc))] -#[doc(cfg(not(feature = "host")))] +#[cfg(feature = "device")] impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { unsafe { &mut *self.pointer } @@ -336,7 +329,7 @@ pub trait CudaKernelParameter: sealed::Sealed { type AsyncHostType<'stream, 'b>; #[doc(hidden)] type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b>; #[cfg(feature = "host")] @@ -361,7 +354,7 @@ pub trait CudaKernelParameter: sealed::Sealed { ) -> Self::FfiType<'stream, 'b>; #[doc(hidden)] - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = 
"device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -428,7 +421,7 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; #[cfg(feature = "host")] @@ -460,7 +453,7 @@ impl< param } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -492,7 +485,7 @@ impl< 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -535,7 +528,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -565,7 +558,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; @@ -596,7 +589,7 @@ impl< <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: 
Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -639,7 +632,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter 'b, crate::utils::device_copy::SafeDeviceCopyWrapper, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; @@ -690,7 +683,7 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -773,7 +766,7 @@ impl< 'b, DeviceAccessible<::CudaRepresentation>, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; type FfiType<'stream, 'b> = DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -804,7 +797,7 @@ impl< unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -830,7 +823,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara 'b, DeviceAccessible<::CudaRepresentation>, >; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; @@ -861,7 +854,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { param.for_device_async() } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: 
Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -884,7 +877,7 @@ impl< #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -918,7 +911,7 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -945,7 +938,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = @@ -980,7 +973,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, @@ -1002,7 +995,7 @@ fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn emit_param_ptx_jit_marker(param: &T) { unsafe { core::arch::asm!( @@ -1014,7 +1007,7 @@ fn emit_param_ptx_jit_marker(param: &T) { } mod private_shared { - use 
const_type_layout::TypeGraphLayout; + use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; #[doc(hidden)] @@ -1044,7 +1037,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; #[cfg(feature = "host")] @@ -1074,7 +1067,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter private_shared::ThreadBlockSharedFfi { _marker: [] } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] #[allow(clippy::inline_always)] #[inline(always)] unsafe fn with_ffi_as_device( @@ -1097,7 +1090,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; #[cfg(feature = "host")] @@ -1130,7 +1123,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter } } - #[cfg(all(not(feature = "host"), target_os = "cuda"))] + #[cfg(feature = "device")] #[allow(clippy::inline_always)] #[inline(always)] unsafe fn with_ffi_as_device( diff --git a/src/deps.rs b/src/deps.rs new file mode 100644 index 000000000..fe001e054 --- /dev/null +++ b/src/deps.rs @@ -0,0 +1,6 @@ +pub extern crate const_type_layout; + +#[cfg(feature = "host")] +pub extern crate rustacuda; + +pub extern crate rustacuda_core; diff --git a/src/device/alloc.rs b/src/device/alloc.rs index 0217fa939..c1c28f931 100644 --- 
a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -1,5 +1,5 @@ use alloc::alloc::{GlobalAlloc, Layout}; -#[cfg(target_os = "cuda")] +#[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; /// Memory allocator using CUDA malloc/free diff --git a/src/device/mod.rs b/src/device/mod.rs index c4a459087..07894b5bb 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,7 +1,6 @@ use core::mem::ManuallyDrop; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; use crate::{ diff --git a/src/device/thread.rs b/src/device/thread.rs index b2f3035bd..bb5599cda 100644 --- a/src/device/thread.rs +++ b/src/device/thread.rs @@ -1,4 +1,4 @@ -#[cfg(target_os = "cuda")] +#[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; #[allow(clippy::module_name_repetitions)] diff --git a/src/host/mod.rs b/src/host/mod.rs index 424b5726c..cc7fa681f 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -18,7 +18,6 @@ use rustacuda::{ use rustacuda_core::{DeviceCopy, DevicePointer}; #[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; use crate::{ @@ -162,8 +161,6 @@ pub struct LaunchConfig { pub ptx_jit: bool, } -#[doc(cfg(feature = "host"))] -#[allow(clippy::module_name_repetitions)] pub struct RawPtxKernel { module: ManuallyDrop>, function: ManuallyDrop>, diff --git a/src/host/ptx_jit/mod.rs b/src/host/ptx_jit/mod.rs index 156e8223c..43c555ab2 100644 --- a/src/host/ptx_jit/mod.rs +++ b/src/host/ptx_jit/mod.rs @@ -6,7 +6,6 @@ mod replace; type ByteSliceOptionalArguments = Option>]>>; -#[doc(cfg(feature = "host"))] #[allow(clippy::module_name_repetitions)] pub struct PtxJITCompiler { ptx_slices: Box<[PtxElement]>, @@ -14,7 +13,6 @@ pub struct PtxJITCompiler { last_ptx: CString, } -#[doc(cfg(feature = "host"))] pub enum PtxJITResult<'s> { Cached(&'s CStr), Recomputed(&'s CStr), diff --git 
a/src/lib.rs b/src/lib.rs index 26f56adb6..118a55343 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,22 +6,14 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] -#![cfg_attr(not(feature = "host"), no_std)] +#![cfg_attr(all(feature = "device", not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] #![feature(negative_impls)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(stdsimd) -)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(asm_experimental_arch) -)] -#![cfg_attr( - any(all(not(feature = "host"), target_os = "cuda"), doc), - feature(asm_const) -)] +#![cfg_attr(feature = "device", feature(stdsimd))] +#![cfg_attr(feature = "device", feature(asm_experimental_arch))] +#![cfg_attr(feature = "device", feature(asm_const))] +#![feature(doc_auto_cfg)] #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] @@ -35,43 +27,36 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] -#![feature(tuple_trait)] -#![feature(unboxed_closures)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] -#![cfg_attr(target_os = "cuda", feature(slice_ptr_get))] +#![cfg_attr(feature = "device", feature(slice_ptr_get))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] -#[doc(hidden)] -pub extern crate alloc; +#[cfg(all(feature = "host", feature = "device", not(doc)))] +core::compile_error!("cannot enable the `host` and `device` features at the same time"); -pub extern crate rustacuda_core; +#[cfg(all(feature = "host", target_os = "cuda", not(doc)))] +core::compile_error!("cannot enable the `host` feature on a target with `target_os=\"cuda\"`"); -#[doc(hidden)] -#[macro_use] -pub extern crate const_type_layout; +#[cfg(all(feature = 
"device", not(target_os = "cuda"), not(doc)))] +core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -#[cfg(feature = "derive")] -#[doc(cfg(feature = "derive"))] -pub extern crate rustacuda_derive; +#[doc(hidden)] +pub extern crate alloc; pub mod common; #[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] pub mod host; -#[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] -pub extern crate rustacuda; - -#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +#[cfg(feature = "device")] pub mod device; pub mod utils; pub mod safety; + +pub mod deps; diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 759b14dc9..131a05803 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -1,9 +1,11 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; +use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -13,6 +15,7 @@ use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { + #[cfg(feature = "host")] #[must_use] pub const fn new(inner: T) -> Self { Self(inner) @@ -26,7 +29,7 @@ unsafe impl DeviceCopy { } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); @@ -34,7 +37,7 @@ fn split_slice_const_stride(slice: &[E]) -> &[E] { unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn 
split_slice_const_stride_mut(slice: &mut [E]) -> &mut [E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; let len = slice.len().min(offset + STRIDE).saturating_sub(offset); @@ -42,7 +45,7 @@ fn split_slice_const_stride_mut(slice: &mut [E]) -> &mut unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] impl SplitSliceOverCudaThreadsConstStride { /// # Safety /// @@ -63,7 +66,8 @@ impl SplitSliceOverCudaThreadsConstStride { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -74,7 +78,8 @@ impl, const STRIDE: usize> Deref } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -83,7 +88,8 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -92,7 +98,8 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -101,7 +108,8 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -110,7 +118,8 @@ impl, const STRIDE: 
usize> Borrow<[E]> } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -119,7 +128,7 @@ impl, const STRIDE: usize> BorrowMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Deref for SplitSliceOverCudaThreadsConstStride { @@ -130,7 +139,7 @@ impl, const STRIDE: usize> Deref } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> DerefMut for SplitSliceOverCudaThreadsConstStride { @@ -139,7 +148,7 @@ impl, const STRIDE: usize> DerefMut } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsRef<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -148,7 +157,7 @@ impl, const STRIDE: usize> AsRef<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> AsMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -157,7 +166,7 @@ impl, const STRIDE: usize> AsMut<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> Borrow<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -166,7 +175,7 @@ impl, const STRIDE: usize> Borrow<[E]> } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl, const STRIDE: usize> BorrowMut<[E]> for SplitSliceOverCudaThreadsConstStride { @@ -183,7 +192,6 @@ unsafe impl RustToCuda SplitSliceOverCudaThreadsConstStride, STRIDE>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -201,7 +209,6 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - #[doc(cfg(feature = 
"host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -214,7 +221,6 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsConstStride { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -233,7 +239,6 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -248,8 +253,8 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsConstStride; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsConstStride::new(CudaAsRust::as_rust(&this.0)) + SplitSliceOverCudaThreadsConstStride(CudaAsRust::as_rust(&this.0)) } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index a3cecfa8f..a6577fc6f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -1,9 +1,11 @@ +#[cfg(any(feature = "host", feature = "device"))] use core::{ borrow::{Borrow, BorrowMut}, convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; +use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -16,6 +18,7 @@ pub struct SplitSliceOverCudaThreadsDynamicStride { } impl SplitSliceOverCudaThreadsDynamicStride { + #[cfg(feature = "host")] #[must_use] pub const fn new(inner: T, stride: usize) -> Self { Self { stride, inner } @@ -26,7 +29,7 @@ impl SplitSliceOverCudaThreadsDynamicStride { // [`DeviceCopy`] unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = 
slice.len().min(offset + stride).saturating_sub(offset); @@ -34,7 +37,7 @@ fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { unsafe { core::slice::from_raw_parts(slice.as_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] { let offset: usize = crate::device::thread::Thread::this().index() * stride; let len = slice.len().min(offset + stride).saturating_sub(offset); @@ -42,7 +45,7 @@ fn split_slice_dynamic_stride_mut(slice: &mut [E], stride: usize) -> &mut [E] unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().add(offset), len) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] impl SplitSliceOverCudaThreadsDynamicStride { /// # Safety /// @@ -63,7 +66,8 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -72,42 +76,47 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { split_slice_dynamic_stride_mut(&mut self.inner, self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { split_slice_dynamic_stride(self.inner.as_ref(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] 
impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.as_mut(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { split_slice_dynamic_stride(self.inner.borrow(), self.stride) } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(any(feature = "device", doc))] +#[doc(cfg(any(feature = "device", feature = "host")))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { split_slice_dynamic_stride_mut(self.inner.borrow_mut(), self.stride) } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Deref for SplitSliceOverCudaThreadsDynamicStride { type Target = [E]; @@ -116,35 +125,35 @@ impl> Deref for SplitSliceOverCudaThreadsDynamicStride } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> DerefMut for SplitSliceOverCudaThreadsDynamicStride { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsRef<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_ref(&self) -> &[E] { self.inner.as_ref() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> AsMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn as_mut(&mut self) -> &mut [E] { self.inner.as_mut() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = "host", not(doc)))] impl> Borrow<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow(&self) -> &[E] { self.inner.borrow() } } -#[cfg(any(feature = "host", not(target_os = "cuda")))] +#[cfg(all(feature = 
"host", not(doc)))] impl> BorrowMut<[E]> for SplitSliceOverCudaThreadsDynamicStride { fn borrow_mut(&mut self) -> &mut [E] { self.inner.borrow_mut() @@ -157,7 +166,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride SplitSliceOverCudaThreadsDynamicStride>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -178,7 +186,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -189,7 +196,6 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -211,7 +217,6 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -226,8 +231,11 @@ unsafe impl CudaAsRust { type RustRepresentation = SplitSliceOverCudaThreadsDynamicStride; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - SplitSliceOverCudaThreadsDynamicStride::new(CudaAsRust::as_rust(&this.inner), this.stride) + SplitSliceOverCudaThreadsDynamicStride { + stride: this.stride, + inner: CudaAsRust::as_rust(&this.inner), + } } } diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 366de9557..230ea4e8a 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,3 +1,4 @@ +use const_type_layout::TypeLayout; use r#final::Final; use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; @@ -16,7 +17,6 @@ unsafe impl RustToCuda for Final { type CudaRepresentation = FinalCudaRepresentation; 
#[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -34,7 +34,6 @@ unsafe impl RustToCuda for Final { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -48,7 +47,6 @@ unsafe impl RustToCuda for Final { unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -67,7 +65,6 @@ unsafe impl RustToCudaAsync for Final { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: crate::common::CombinedCudaAlloc, @@ -83,7 +80,7 @@ unsafe impl RustToCudaAsync for Final { unsafe impl CudaAsRust for FinalCudaRepresentation { type RustRepresentation = Final; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { Final::new(CudaAsRust::as_rust(&this.0)) } diff --git a/src/utils/box.rs b/src/utils/box.rs index ab0b22708..9972c4ef3 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -1,18 +1,22 @@ use alloc::boxed::Box; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::{ common::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - rustacuda::error::CudaResult, - rustacuda::memory::DeviceBox, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -31,14 +35,13 @@ unsafe impl rustacuda_core::DeviceCopy } unsafe impl RustToCuda for Box { - #[cfg(feature = "host")] + #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = 
crate::host::CudaDropWrapper>>; - #[cfg(not(feature = "host"))] + #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -59,7 +62,6 @@ unsafe impl RustToCuda for Box { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -79,7 +81,7 @@ unsafe impl RustToCuda for Box { unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(this.0) } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index 588fa8c07..bd9e74aee 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -1,18 +1,22 @@ use alloc::boxed::Box; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, DeviceAccessible, RustToCuda}, + common::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::{ common::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - rustacuda::error::CudaResult, - rustacuda::memory::DeviceBuffer, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -31,14 +35,13 @@ unsafe impl rustacuda_core::DeviceCopy } unsafe impl RustToCuda for Box<[T]> { - #[cfg(feature = "host")] + #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; - #[cfg(not(feature = "host"))] + #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::common::SomeCudaAlloc; type CudaRepresentation = 
BoxedSliceCudaRepresentation; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -61,7 +64,6 @@ unsafe impl RustToCuda for Box<[T]> { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -81,7 +83,7 @@ unsafe impl RustToCuda for Box<[T]> { unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { type RustRepresentation = Box<[T]>; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 0a92e69a1..1f03c1799 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -1,12 +1,15 @@ #![allow(clippy::trait_duplication_in_bounds)] -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ - common::{CudaAsRust, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync}, + common::{CudaAsRust, NoCudaAlloc, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + #[cfg(feature = "host")] use crate::common::{CombinedCudaAlloc, CudaAlloc}; @@ -100,7 +103,6 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -127,7 +129,6 @@ unsafe impl RustToCudaAsync for SafeDeviceC } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: CombinedCudaAlloc, @@ -142,7 +143,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> 
Self::RustRepresentation { let mut uninit = core::mem::MaybeUninit::uninit(); core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 12a491b20..2725811ca 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,4 +1,4 @@ -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; use crate::{common::CudaAsRust, safety::SafeDeviceCopy}; @@ -28,10 +28,12 @@ unsafe impl; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { - CudaExchangeBuffer(core::mem::ManuallyDrop::new(alloc::boxed::Box::from_raw( - core::slice::from_raw_parts_mut(this.0, this.1), - ))) + CudaExchangeBuffer { + inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( + alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)), + )), + } } } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index f6f00248b..ed160e185 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,26 +2,18 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{ - common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}, - safety::SafeDeviceCopy, -}; +use crate::safety::SafeDeviceCopy; -use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; +use super::CudaExchangeItem; #[allow(clippy::module_name_repetitions)] -#[doc(cfg(not(feature = "host")))] -/// When the `host` feature is set, -/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer) -/// refers to -/// [`CudaExchangeBufferHost`](super::CudaExchangeBufferHost) -/// instead. -/// [`CudaExchangeBufferDevice`](Self) is never exposed directly. 
-pub struct CudaExchangeBufferDevice( - pub(super) core::mem::ManuallyDrop]>>, -); +pub struct CudaExchangeBufferDevice< + T: SafeDeviceCopy + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -31,24 +23,10 @@ impl Deref } } -impl DerefMut +impl DerefMut for CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } - -#[cfg(not(all(doc, feature = "host")))] -unsafe impl RustToCuda - for CudaExchangeBufferDevice -{ - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = CudaExchangeBufferCudaRepresentation; -} - -#[cfg(not(all(doc, feature = "host")))] -unsafe impl RustToCudaAsync - for CudaExchangeBufferDevice -{ -} diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 24a95bfe3..56fc259cd 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -11,9 +11,7 @@ use rustacuda::{ }; use crate::{ - common::{ - CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc, RustToCuda, RustToCudaAsync, - }, + common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc}, host::CudaDropWrapper, safety::SafeDeviceCopy, }; @@ -21,13 +19,6 @@ use crate::{ use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] -#[doc(cfg(feature = "host"))] -/// When the `host` feature is **not** set, -/// [`CudaExchangeBuffer`](super::CudaExchangeBuffer) -/// refers to -/// [`CudaExchangeBufferDevice`](super::CudaExchangeBufferDevice) -/// instead. -/// [`CudaExchangeBufferHost`](Self) is never exposed directly. 
pub struct CudaExchangeBufferHost< T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, @@ -104,19 +95,16 @@ impl Dere } } -unsafe impl RustToCuda - for CudaExchangeBufferHost +impl + CudaExchangeBufferHost { - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = CudaExchangeBufferCudaRepresentation; - #[allow(clippy::type_complexity)] - unsafe fn borrow( + pub unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + DeviceAccessible>, + CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda @@ -141,9 +129,9 @@ unsafe impl( + pub unsafe fn restore( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { let (_alloc_front, alloc_tail) = alloc.split(); @@ -160,17 +148,17 @@ unsafe impl RustToCudaAsync - for CudaExchangeBufferHost +impl + CudaExchangeBufferHost { #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + pub unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + DeviceAccessible>, + CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda @@ -196,9 +184,9 @@ unsafe impl( + pub unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { let (_alloc_front, alloc_tail) = alloc.split(); diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 66b2144c1..dcbbc036f 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -1,22 +1,146 @@ -use core::mem::MaybeUninit; +#[cfg(any(feature = "host", feature = "device"))] +use core::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; +use const_type_layout::TypeLayout; + +#[cfg(any(feature = 
"host", feature = "device"))] +use const_type_layout::TypeGraphLayout; + +use crate::safety::SafeDeviceCopy; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}; + +#[cfg(feature = "host")] +use crate::common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible}; + +#[cfg(any(feature = "host", feature = "device"))] +use self::common::CudaExchangeBufferCudaRepresentation; + +#[cfg(any(feature = "host", feature = "device"))] mod common; -#[cfg(any(not(feature = "host"), doc))] +#[cfg(feature = "device")] mod device; #[cfg(feature = "host")] mod host; -#[cfg(not(feature = "host"))] +#[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] -pub use device::CudaExchangeBufferDevice as CudaExchangeBuffer; +pub struct CudaExchangeBuffer +{ + #[cfg(feature = "host")] + inner: host::CudaExchangeBufferHost, + #[cfg(all(feature = "device", not(feature = "host")))] + inner: device::CudaExchangeBufferDevice, +} + #[cfg(feature = "host")] -#[allow(clippy::module_name_repetitions)] -pub use host::CudaExchangeBufferHost as CudaExchangeBuffer; +impl + CudaExchangeBuffer +{ + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn new(elem: &T, capacity: usize) -> rustacuda::error::CudaResult { + Ok(Self { + inner: host::CudaExchangeBufferHost::new(elem, capacity)?, + }) + } +} -#[cfg(doc)] -pub use self::{device::CudaExchangeBufferDevice, host::CudaExchangeBufferHost}; +#[cfg(feature = "host")] +impl + CudaExchangeBuffer +{ + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + pub fn from_vec(vec: Vec) -> rustacuda::error::CudaResult { + Ok(Self { + inner: host::CudaExchangeBufferHost::from_vec(vec)?, + }) + } +} -use crate::safety::SafeDeviceCopy; +#[cfg(any(feature = "host", feature = "device"))] +impl Deref + for CudaExchangeBuffer +{ + type Target = [CudaExchangeItem]; + + fn 
deref(&self) -> &Self::Target { + &self.inner + } +} + +#[cfg(any(feature = "host", feature = "device"))] +impl DerefMut + for CudaExchangeBuffer +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl RustToCuda + for CudaExchangeBuffer +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = CudaExchangeBufferCudaRepresentation; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + self.inner.borrow(alloc) + } + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + self.inner.restore(alloc) + } +} + +#[cfg(any(feature = "host", feature = "device"))] +unsafe impl RustToCudaAsync + for CudaExchangeBuffer +{ + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + self.inner.borrow_async(alloc, stream) + } + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + self.inner.restore_async(alloc, stream) + } +} #[repr(transparent)] #[derive(Clone, Copy, TypeLayout)] @@ -30,28 +154,24 @@ unsafe impl rustacuda_core: } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub fn write(&mut self, value: T) { self.0 = value; } } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), 
doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn read(&self) -> &T { &self.0 } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn write(&mut self, value: T) { self.0 = value; } @@ -64,36 +184,31 @@ impl AsMut for CudaExchangeItem { } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub fn as_scratch_mut(&mut self) -> &mut T { &mut self.0 } } impl CudaExchangeItem { - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -101,8 +216,7 @@ impl CudaExchangeItem { unsafe { &*(self as *const Self).cast() } } - #[cfg(any(feature = "host", doc))] - #[doc(cfg(feature = "host"))] + #[cfg(feature = "host")] pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -112,8 +226,7 @@ impl CudaExchangeItem { } impl CudaExchangeItem { - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union @@ -121,8 +234,7 @@ impl CudaExchangeItem { unsafe { &*(self as *const Self).cast() } } - #[cfg(any(not(feature = "host"), doc))] - #[doc(cfg(not(feature = "host")))] + #[cfg(feature = 
"device")] pub fn as_uninit_mut(&mut self) -> &mut MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index ffca4bbf3..722e02559 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,5 +1,4 @@ pub mod buffer; #[cfg(feature = "host")] -#[doc(cfg(feature = "host"))] pub mod wrapper; diff --git a/src/utils/option.rs b/src/utils/option.rs index a7b3e991e..dec109f38 100644 --- a/src/utils/option.rs +++ b/src/utils/option.rs @@ -1,6 +1,9 @@ use core::mem::MaybeUninit; -use const_type_layout::TypeGraphLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::error::CudaResult; use crate::{ common::{ @@ -12,10 +15,7 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, - rustacuda::error::CudaResult, -}; +use crate::common::{CombinedCudaAlloc, CudaAlloc}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -35,7 +35,6 @@ unsafe impl RustToCuda for Option { type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow( &self, @@ -71,7 +70,6 @@ unsafe impl RustToCuda for Option { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore( &mut self, alloc: CombinedCudaAlloc, @@ -89,7 +87,6 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -126,7 +123,6 @@ unsafe impl RustToCudaAsync for Option { } #[cfg(feature = "host")] - #[doc(cfg(feature = "host"))] unsafe fn restore_async( &mut self, alloc: CombinedCudaAlloc, @@ -146,7 +142,7 @@ unsafe impl RustToCudaAsync for Option { unsafe impl CudaAsRust for OptionCudaRepresentation { type RustRepresentation = 
Option<::RustRepresentation>; - #[cfg(not(feature = "host"))] + #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { if this.present { Some(CudaAsRust::as_rust(this.maybe.assume_init_ref())) diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 7039e15f9..bec725bd1 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -3,56 +3,40 @@ use const_type_layout::TypeGraphLayout; #[allow(clippy::module_name_repetitions)] #[repr(transparent)] pub struct ThreadBlockSharedSlice { - #[cfg(not(target_os = "cuda"))] - // dangling marker s.t. Self is not StackOnly - dangling: *mut [T], - #[cfg(target_os = "cuda")] shared: *mut [T], } impl ThreadBlockSharedSlice { - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn new_uninit_with_len(len: usize) -> Self { Self { - dangling: Self::dangling_slice_with_len(len), + shared: Self::dangling_slice_with_len(len), } } - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn with_len(mut self, len: usize) -> Self { - self.dangling = Self::dangling_slice_with_len(len); + self.shared = Self::dangling_slice_with_len(len); self } - #[cfg(any(not(target_os = "cuda"), doc))] - #[doc(cfg(not(target_os = "cuda")))] + #[cfg(feature = "host")] #[must_use] pub fn with_len_mut(&mut self, len: usize) -> &mut Self { - self.dangling = Self::dangling_slice_with_len(len); + self.shared = Self::dangling_slice_with_len(len); self } - #[cfg(not(target_os = "cuda"))] + #[cfg(feature = "host")] fn dangling_slice_with_len(len: usize) -> *mut [T] { core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len) } #[must_use] pub fn len(&self) -> usize { - core::ptr::metadata({ - #[cfg(not(target_os = "cuda"))] - { - self.dangling - } - #[cfg(target_os = "cuda")] - { - self.shared - } - }) + 
core::ptr::metadata(self.shared) } #[must_use] @@ -60,22 +44,19 @@ impl ThreadBlockSharedSlice { self.len() == 0 } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { self.shared.cast() } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_slice_ptr(&self) -> *mut [T] { self.shared } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] /// # Safety /// /// The provided `index` must not be out of bounds. @@ -89,8 +70,7 @@ impl ThreadBlockSharedSlice { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] +#[cfg(feature = "device")] impl ThreadBlockSharedSlice { /// # Safety /// @@ -129,7 +109,7 @@ impl ThreadBlockSharedSlice { } #[doc(hidden)] -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] /// # Safety /// /// The thread-block shared dynamic memory must be initialised once and @@ -143,5 +123,5 @@ pub unsafe fn init() { } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] +#[cfg(feature = "device")] core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index 41ba334ba..62c3a0c49 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -1,25 +1,23 @@ #[repr(transparent)] pub struct ThreadBlockShared { - #[cfg(not(target_os = "cuda"))] - // dangling marker s.t. 
Self is not StackOnly - _dangling: *mut T, - #[cfg(target_os = "cuda")] + #[cfg_attr(not(feature = "device"), allow(dead_code))] shared: *mut T, } impl ThreadBlockShared { + #[cfg(any(feature = "host", feature = "device"))] #[must_use] #[allow(clippy::inline_always, clippy::missing_const_for_fn)] #[inline(always)] pub fn new_uninit() -> Self { - #[cfg(not(target_os = "cuda"))] + #[cfg(feature = "host")] { Self { - _dangling: core::ptr::NonNull::dangling().as_ptr(), + shared: core::ptr::NonNull::dangling().as_ptr(), } } - #[cfg(target_os = "cuda")] + #[cfg(feature = "device")] { let shared: *mut T; @@ -37,8 +35,7 @@ impl ThreadBlockShared { } } - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { self.shared @@ -46,8 +43,7 @@ impl ThreadBlockShared { } impl ThreadBlockShared<[T; N]> { - #[cfg(any(target_os = "cuda", doc))] - #[doc(cfg(target_os = "cuda"))] + #[cfg(feature = "device")] /// # Safety /// /// The provided `index` must not be out of bounds. 
From 28a1e266ddee49ff6cdb883ff5548ce4dd381838 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:08:39 +0000 Subject: [PATCH 063/120] Refactoring to prepare for better module structure --- src/common.rs | 4 ++-- src/deps.rs | 2 ++ src/device/alloc.rs | 3 ++- src/device/utils.rs | 8 +++++--- src/host/mod.rs | 2 +- src/host/ptx_jit/replace.rs | 3 +-- src/lib.rs | 5 +---- src/utils/box.rs | 4 ++-- src/utils/boxed_slice.rs | 4 ++-- src/utils/exchange/buffer/common.rs | 4 +++- src/utils/exchange/buffer/device.rs | 4 ++-- src/utils/exchange/buffer/host.rs | 3 +-- src/utils/exchange/wrapper.rs | 5 ++--- 13 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/common.rs b/src/common.rs index d9d1a955a..d7b9815b4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -5,13 +5,13 @@ use core::{ ops::{Deref, DerefMut}, }; -#[cfg(feature = "host")] -use alloc::fmt; #[cfg(feature = "host")] use core::{ mem::MaybeUninit, ptr::{copy_nonoverlapping, NonNull}, }; +#[cfg(feature = "host")] +use std::fmt; use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; diff --git a/src/deps.rs b/src/deps.rs index fe001e054..68257e095 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -1,3 +1,5 @@ +pub(crate) extern crate alloc; + pub extern crate const_type_layout; #[cfg(feature = "host")] diff --git a/src/device/alloc.rs b/src/device/alloc.rs index c1c28f931..bca59a1eb 100644 --- a/src/device/alloc.rs +++ b/src/device/alloc.rs @@ -1,7 +1,8 @@ -use alloc::alloc::{GlobalAlloc, Layout}; #[cfg(all(feature = "device", not(doc)))] use core::arch::nvptx; +use crate::deps::alloc::alloc::{GlobalAlloc, Layout}; + /// Memory allocator using CUDA malloc/free pub struct PTXAllocator; diff --git a/src/device/utils.rs b/src/device/utils.rs index 3b37307a6..cbc5080ab 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,3 +1,5 @@ +use crate::deps::alloc::{fmt, string::String}; + /// Abort the CUDA kernel using the `trap` system call. 
#[allow(clippy::inline_always)] #[inline(always)] @@ -52,7 +54,7 @@ pub fn print(args: ::core::fmt::Arguments) { let msg = if let Some(msg) = args.as_str() { msg } else { - msg = ::alloc::fmt::format(args); + msg = fmt::format(args); msg.as_str() }; @@ -93,7 +95,7 @@ pub fn pretty_panic_handler( if let Some(msg) = message.as_str() { msg } else if allow_dynamic_message { - msg = ::alloc::fmt::format(*message); + msg = fmt::format(*message); msg.as_str() } else { "" @@ -102,7 +104,7 @@ pub fn pretty_panic_handler( && allow_dynamic_payload { msg - } else if let Some(msg) = info.payload().downcast_ref::<::alloc::string::String>() + } else if let Some(msg) = info.payload().downcast_ref::() && allow_dynamic_payload { msg.as_str() diff --git a/src/host/mod.rs b/src/host/mod.rs index cc7fa681f..45dc6f059 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -1,9 +1,9 @@ -use core::ptr::NonNull; use std::{ ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, + ptr::NonNull, }; use rustacuda::{ diff --git a/src/host/ptx_jit/replace.rs b/src/host/ptx_jit/replace.rs index ed59701c7..97a592da9 100644 --- a/src/host/ptx_jit/replace.rs +++ b/src/host/ptx_jit/replace.rs @@ -1,5 +1,4 @@ -use core::ptr::NonNull; -use std::{ffi::CString, ops::Deref}; +use std::{ffi::CString, ops::Deref, ptr::NonNull}; use super::{PtxElement, PtxJITCompiler, PtxJITResult, PtxLoadWidth}; diff --git a/src/lib.rs b/src/lib.rs index 118a55343..16d48d0b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![allow(clippy::useless_attribute)] -#![cfg_attr(all(feature = "device", not(doc)), no_std)] +#![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] #![feature(negative_impls)] @@ -44,9 +44,6 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ #[cfg(all(feature = "device", not(target_os = "cuda"), 
not(doc)))] core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -#[doc(hidden)] -pub extern crate alloc; - pub mod common; #[cfg(feature = "host")] diff --git a/src/utils/box.rs b/src/utils/box.rs index 9972c4ef3..8672c36a0 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -1,4 +1,4 @@ -use alloc::boxed::Box; +use crate::deps::alloc::boxed::Box; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -83,6 +83,6 @@ unsafe impl CudaAsRust for BoxCudaRepresent #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(this.0) + crate::deps::alloc::boxed::Box::from_raw(this.0) } } diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index bd9e74aee..e9113d865 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -1,4 +1,4 @@ -use alloc::boxed::Box; +use crate::deps::alloc::boxed::Box; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -85,6 +85,6 @@ unsafe impl CudaAsRust for BoxedSliceCudaRe #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) } } diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 2725811ca..31f50cb68 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -32,7 +32,9 @@ unsafe impl) -> Self::RustRepresentation { CudaExchangeBuffer { inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( - alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)), + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( + this.0, this.1, + )), )), } } diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 
ed160e185..139224da3 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,7 +2,7 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::safety::SafeDeviceCopy; +use crate::{deps::alloc::boxed::Box, safety::SafeDeviceCopy}; use super::CudaExchangeItem; @@ -11,7 +11,7 @@ pub struct CudaExchangeBufferDevice< T: SafeDeviceCopy + TypeGraphLayout, const M2D: bool, const M2H: bool, ->(pub(super) core::mem::ManuallyDrop]>>); +>(pub(super) core::mem::ManuallyDrop]>>); impl Deref for CudaExchangeBufferDevice diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 56fc259cd..9bbf8a0af 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,4 @@ -use alloc::vec::Vec; -use core::{ +use std::{ cell::UnsafeCell, ops::{Deref, DerefMut}, }; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 4edfdebd8..5f64d3d05 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,12 +1,11 @@ -use core::{ +use std::{ future::{Future, IntoFuture}, marker::PhantomData, ops::{Deref, DerefMut}, + sync::{Arc, Mutex}, task::{Poll, Waker}, }; -use std::sync::Mutex; -use alloc::sync::Arc; use rustacuda::{ error::{CudaError, CudaResult}, event::{Event, EventFlags, EventStatus}, From cdd84a4d6b671a700b65da7c80ff85b8fcb2e937 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:31:51 +0000 Subject: [PATCH 064/120] Extract kernel module just for parameters --- examples/print/src/main.rs | 4 +- examples/single-source/src/main.rs | 20 +- .../wrapper/generate/cuda_generic_function.rs | 2 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 6 +- .../generate/host_linker_macro/get_ptx.rs | 2 +- src/common.rs | 838 +----------------- src/host/mod.rs | 7 +- src/kernel.rs | 838 ++++++++++++++++++ src/lib.rs | 10 +- src/safety/no_aliasing.rs | 8 +- src/utils/shared/mod.rs | 13 +- 
src/utils/shared/slice.rs | 1 - 12 files changed, 881 insertions(+), 868 deletions(-) create mode 100644 src/kernel.rs diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 462603ca6..7a26ce2bd 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -21,9 +21,9 @@ pub enum Action { AllocError, } -#[rust_cuda::common::kernel(use link! for impl)] +#[rust_cuda::kernel::kernel(use link! for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(action: rust_cuda::common::PerThreadShallowCopy) { +pub fn kernel(action: rust_cuda::kernel::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 41df1705d..383ade30a 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -45,7 +45,7 @@ pub struct Tuple(u32, i32); #[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); -#[rc::common::kernel(pub use link! for impl)] +#[rc::kernel::kernel(pub use link! 
for impl)] #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), @@ -61,16 +61,16 @@ pub fn kernel< + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - _x: &rc::common::PerThreadShallowCopy, - _z: &rc::common::SharedHeapPerThreadShallowCopy>, - _v @ _w: &'a rc::common::ShallowInteriorMutable, - _: rc::common::SharedHeapPerThreadShallowCopy>, - q @ Triple(s, mut __t, _u): rc::common::PerThreadShallowCopy, - shared3: &mut rc::utils::shared::r#static::ThreadBlockShared, - dynamic: &mut rc::utils::shared::slice::ThreadBlockSharedSlice, + _x: &rc::kernel::PerThreadShallowCopy, + _z: &rc::kernel::SharedHeapPerThreadShallowCopy>, + _v @ _w: &'a rc::kernel::ShallowInteriorMutable, + _: rc::kernel::SharedHeapPerThreadShallowCopy>, + q @ Triple(s, mut __t, _u): rc::kernel::PerThreadShallowCopy, + shared3: &mut rc::utils::shared::ThreadBlockShared, + dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, ) { - let shared = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); - let shared2 = rc::utils::shared::r#static::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); + let shared2 = rc::utils::shared::ThreadBlockShared::<[Tuple; 3]>::new_uninit(); #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs index 8a5de226e..1b05df23b 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -26,7 +26,7 @@ pub(in super::super) fn quote_cuda_generic_function( colon_token, }| { let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::common::CudaKernelParameter>::DeviceType<'_> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<'_> }; syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs index e87bd0d16..f61bb9b32 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -50,7 +50,7 @@ pub(in super::super) fn quote_cuda_wrapper( quote::quote_spanned! { ty.span()=> unsafe { < - #specialised_ty as #crate_path::common::CudaKernelParameter + #specialised_ty as #crate_path::kernel::CudaKernelParameter >::with_ffi_as_device::<_, #i>( #pat, |#pat| { #inner } ) @@ -69,7 +69,7 @@ pub(in super::super) fn quote_cuda_wrapper( unsafe { // Initialise the dynamically-sized thread-block shared memory // and the thread-local offset pointer that points to it - #crate_path::utils::shared::slice::init(); + #crate_path::utils::shared::init(); } unsafe { @@ -122,7 +122,7 @@ fn specialise_ffi_input_types( }; let ffi_ty: syn::Type = syn::parse_quote_spanned! { ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> }; let ffi_param = syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 439f27f9e..d7394142e 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -143,7 +143,7 @@ fn generate_lifetime_erased_types( } quote::quote_spanned! 
{ ty.span()=> - <#specialised_ty as #crate_path::common::CudaKernelParameter>::FfiType<'static, 'static> + <#specialised_ty as #crate_path::kernel::CudaKernelParameter>::FfiType<'static, 'static> } }).collect() } diff --git a/src/common.rs b/src/common.rs index d7b9815b4..37d005ac4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,15 +1,12 @@ +use core::marker::PhantomData; #[cfg(feature = "device")] -use core::convert::{AsMut, AsRef}; use core::{ - marker::PhantomData, + convert::{AsMut, AsRef}, ops::{Deref, DerefMut}, }; #[cfg(feature = "host")] -use core::{ - mem::MaybeUninit, - ptr::{copy_nonoverlapping, NonNull}, -}; +use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; #[cfg(feature = "host")] use std::fmt; @@ -19,9 +16,6 @@ use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] pub use rust_cuda_derive::LendRustToCuda; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; - #[cfg(feature = "host")] use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; @@ -315,829 +309,3 @@ impl CombinedCudaAlloc { (self.0, self.1) } } - -mod sealed { - #[doc(hidden)] - pub trait Sealed {} -} - -// TODO: doc cfg -pub trait CudaKernelParameter: sealed::Sealed { - #[cfg(feature = "host")] - type SyncHostType; - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; - #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; - - #[cfg(feature = "host")] - #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn 
async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; - - #[doc(hidden)] - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; -} - -pub struct PtxJit { - never: !, - _marker: PhantomData, -} - -impl Deref for PtxJit { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl DerefMut for PtxJit { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, -> { - never: !, - _marker: PhantomData, -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > Deref for PerThreadShallowCopy -{ - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > DerefMut for PerThreadShallowCopy -{ - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for PerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( - 
param, - )) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - param - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.into_inner(); - - inner(param) - } -} -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for PerThreadShallowCopy -{ -} - -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - 
inner(const_ref.as_async()) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref().into_ref(); - - inner(param) - } -} -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PerThreadShallowCopy -{ -} - -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - 
inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PtxJit> -{ -} - -pub struct ShallowInteriorMutable { - never: !, - _marker: PhantomData, -} - -impl Deref for ShallowInteriorMutable { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter - for &'a ShallowInteriorMutable -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; - #[cfg(feature = "host")] - /// The kernel takes a mutable borrow of the interior mutable data to ensure - /// the interior mutability is limited to just this kernel invocation. 
- type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - let result = inner(const_ref.as_async()); - - host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( - param, - ))?; - - result - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref().into_ref(); - - inner(param) - } -} -impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} - -pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout - + sealed::Sealed -{ -} - -macro_rules! impl_atomic_interior_mutable { - ($atomic:ident($interior:ty)) => { - impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} - impl sealed::Sealed for core::sync::atomic::$atomic {} - }; - ($($atomic:ident($interior:ty)),*) => { - $(impl_atomic_interior_mutable! 
{ $atomic($interior) })* - } -} - -impl_atomic_interior_mutable! { - AtomicBool(bool), - AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), - AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -} - -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell -{ -} -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for core::cell::SyncUnsafeCell -{ -} - -pub struct SharedHeapPerThreadShallowCopy { - never: !, - _marker: PhantomData, -} - -impl Deref for SharedHeapPerThreadShallowCopy { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl DerefMut for SharedHeapPerThreadShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - -impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > CudaKernelParameter for SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = - DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } - } -} -impl< - T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > sealed::Sealed for SharedHeapPerThreadShallowCopy -{ -} - -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - DeviceAccessible<::CudaRepresentation>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: 
Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a SharedHeapPerThreadShallowCopy -{ -} - -impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > CudaKernelParameter for PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - T: RustToCuda< - CudaRepresentation: 
crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, - > sealed::Sealed for PtxJit> -{ -} - -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - 
for &'a PtxJit> -{ -} - -#[cfg(feature = "host")] -fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { - NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) -} - -#[cfg(feature = "device")] -fn emit_param_ptx_jit_marker(param: &T) { - unsafe { - core::arch::asm!( - "// //", - param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), - param_index = const(INDEX), - ); - } -} - -mod private_shared { - use const_type_layout::{TypeGraphLayout, TypeLayout}; - use rustacuda_core::DeviceCopy; - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _marker: [T; 0], - } - - // Safety: there is nothing to copy, this is just a zero-sized marker type - unsafe impl DeviceCopy for ThreadBlockSharedFfi {} - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { - pub(super) len: usize, - pub(super) _marker: [T; 0], - } - - // Safety: we only copy a usize, which implements `DeviceCopy` - unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} -} - -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::r#static::ThreadBlockShared -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::r#static::ThreadBlockShared; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - 
inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - _param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { _marker: [] } - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - _param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let mut param = crate::utils::shared::r#static::ThreadBlockShared::new_uninit(); - - inner(&mut param) - } -} -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::r#static::ThreadBlockShared -{ -} - -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::slice::ThreadBlockSharedSlice; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedSliceFfi { - len: param.len(), - _marker: [], - } - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - param: 
Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { - crate::utils::shared::slice::ThreadBlockSharedSlice::with_uninit_for_len( - param.len, inner, - ) - } - } -} -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::slice::ThreadBlockSharedSlice -{ -} diff --git a/src/host/mod.rs b/src/host/mod.rs index 45dc6f059..2d423362a 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -22,9 +22,10 @@ pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_po use crate::{ common::{ - CudaKernelParameter, DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, - EmptyCudaAlloc, NoCudaAlloc, RustToCuda, + DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, + NoCudaAlloc, RustToCuda, }, + kernel::CudaKernelParameter, safety::{NoSafeAliasing, SafeDeviceCopy}, }; @@ -1154,7 +1155,7 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea /// [`CompiledKernelPtx::get_entry_point`]. /// /// This trait should not be implemented manually – use the -/// [`kernel`](crate::common::kernel) macro instead. +/// [`kernel`](crate::kernel::kernel) macro instead. 
pub unsafe trait CompiledKernelPtx { fn get_ptx() -> &'static CStr; fn get_entry_point() -> &'static CStr; diff --git a/src/kernel.rs b/src/kernel.rs new file mode 100644 index 000000000..98ae0220c --- /dev/null +++ b/src/kernel.rs @@ -0,0 +1,838 @@ +#[cfg(feature = "device")] +use core::convert::AsRef; +use core::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +#[cfg(feature = "host")] +use core::ptr::NonNull; + +use const_type_layout::TypeGraphLayout; + +#[cfg(feature = "derive")] +pub use rust_cuda_derive::kernel; + +use crate::common::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef, EmptyCudaAlloc, RustToCuda}; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + #[doc(hidden)] + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b>; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; +} + +pub struct PtxJit { + never: !, + _marker: PhantomData, +} + +impl Deref for PtxJit { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl 
DerefMut for PtxJit { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +pub struct PerThreadShallowCopy< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, +> { + never: !, + _marker: PhantomData, +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > Deref for PerThreadShallowCopy +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > DerefMut for PerThreadShallowCopy +{ + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T; + type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( + param, + )) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + param + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: 
impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.into_inner(); + + inner(param) + } +} +impl< + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy +{ +} + +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + inner(const_ref.as_async()) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.as_ref().into_ref(); + + inner(param) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy +{ +} + +impl< + 'a, + T: 'static + + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > CudaKernelParameter for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, 
PARAM>(param.as_ref()); + + <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + 'a, + T: crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> +{ +} + +pub struct ShallowInteriorMutable { + never: !, + _marker: PhantomData, +} + +impl Deref for ShallowInteriorMutable { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter + for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + crate::utils::device_copy::SafeDeviceCopyWrapper, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + )?); + + // Safety: `host_box` contains exactly the device copy of `param` + let const_ref = unsafe { + crate::host::HostAndDeviceConstRef::new( + &host_box, + crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), + ) + }; + + let result = inner(const_ref.as_async()); + + host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( + param, + ))?; + + result + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let param = param.as_ref().into_ref(); + + inner(param) + } +} +impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} + +pub trait InteriorMutableSafeDeviceCopy: + crate::safety::SafeDeviceCopy + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout + + sealed::Sealed +{ +} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! 
{ $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! { + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +{ +} +impl< + T: crate::safety::StackOnly + + crate::safety::NoSafeAliasing + + const_type_layout::TypeGraphLayout, + > sealed::Sealed for core::cell::SyncUnsafeCell +{ +} + +pub struct SharedHeapPerThreadShallowCopy { + never: !, + _marker: PhantomData, +} + +impl Deref for SharedHeapPerThreadShallowCopy { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl DerefMut for SharedHeapPerThreadShallowCopy { + fn deref_mut(&mut self) -> &mut Self::Target { + self.never + } +} + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T; + type FfiType<'stream, 'b> = + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } + } +} +impl< + T: RustToCuda< + CudaRepresentation: crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for SharedHeapPerThreadShallowCopy +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< + 'stream, + 'b, + DeviceAccessible<::CudaRepresentation>, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + unsafe { param.for_device_async() } + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: 
Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + for &'a SharedHeapPerThreadShallowCopy +{ +} + +impl< + T: RustToCuda< + CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = + as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl< + T: RustToCuda< + CudaRepresentation: 
crate::safety::SafeDeviceCopy, + CudaAllocation: EmptyCudaAlloc, + > + crate::safety::NoSafeAliasing, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + param, stream, inner, + ) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed + 
for &'a PtxJit> +{ +} + +#[cfg(feature = "host")] +fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { + NonNull::slice_from_raw_parts(NonNull::from(r).cast::(), core::mem::size_of_val(r)) +} + +#[cfg(feature = "device")] +fn emit_param_ptx_jit_marker(param: &T) { + unsafe { + core::arch::asm!( + "// //", + param_reg = in(reg32) *(core::ptr::from_ref(param).cast::()), + param_index = const(INDEX), + ); + } +} + +mod private_shared { + use const_type_layout::{TypeGraphLayout, TypeLayout}; + use rustacuda_core::DeviceCopy; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _marker: [T; 0], + } + + // Safety: there is nothing to copy, this is just a zero-sized marker type + unsafe impl DeviceCopy for ThreadBlockSharedFfi {} + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } + + // Safety: we only copy a usize, which implements `DeviceCopy` + unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockShared +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = 
"host")] + fn async_to_ffi<'stream, 'b>( + _param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedFfi { _marker: [] } + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + _param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + + inner(&mut param) + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockShared +{ +} + +impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + inner(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b> { + private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + } + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, 
+ ) -> O { + unsafe { + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) + } + } +} +impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ +} diff --git a/src/lib.rs b/src/lib.rs index 16d48d0b3..6ba80f56f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,15 +45,13 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); pub mod common; +pub mod deps; +pub mod kernel; +pub mod safety; +pub mod utils; #[cfg(feature = "host")] pub mod host; #[cfg(feature = "device")] pub mod device; - -pub mod utils; - -pub mod safety; - -pub mod deps; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 0fc3abf9c..f5c80e354 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -47,9 +47,9 @@ /// with mutable access to its own partition of a slice and thus avoid mutable /// aliasing. 
/// -/// * [`ThreadBlockShared`](crate::utils::shared::static::ThreadBlockShared) +/// [`ThreadBlockShared`](crate::utils::shared::ThreadBlockShared) /// and -/// [`ThreadBlockSharedSlice`](crate::utils::shared::slice::ThreadBlockSharedSlice) +/// [`ThreadBlockSharedSlice`](crate::utils::shared::ThreadBlockSharedSlice) /// also implement [`NoSafeAliasing`] since they only provide access to `*mut /// T`, which is always unsafe to mutate and thus moves the burden to uphold /// the no-mutable-aliasing safety invariant to the user who dereferences these @@ -79,8 +79,8 @@ unsafe impl NoSafeAliasing // Thread-block-shared data only allows unsafe aliasing since only raw pointers // are exposed -unsafe impl NoSafeAliasing for crate::utils::shared::r#static::ThreadBlockShared {} +unsafe impl NoSafeAliasing for crate::utils::shared::ThreadBlockShared {} unsafe impl NoSafeAliasing - for crate::utils::shared::slice::ThreadBlockSharedSlice + for crate::utils::shared::ThreadBlockSharedSlice { } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index 88a586ad6..dfd3f2019 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,2 +1,11 @@ -pub mod slice; -pub mod r#static; +mod slice; +mod r#static; + +pub use slice::ThreadBlockSharedSlice; + +#[allow(clippy::module_name_repetitions)] +pub use r#static::ThreadBlockShared; + +#[doc(hidden)] +#[cfg(feature = "device")] +pub use slice::init; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index bec725bd1..f60276e6b 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -108,7 +108,6 @@ impl ThreadBlockSharedSlice { } } -#[doc(hidden)] #[cfg(feature = "device")] /// # Safety /// From c8761b0f5a133af6e9c7f9995747b75ea2261620 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 10:47:10 +0000 Subject: [PATCH 065/120] Add RustToCuda impls for &T, &mut T, &[T], and &mut [T] where T: RustToCuda --- src/utils/box.rs | 4 +- src/utils/boxed_slice.rs | 6 +--
src/utils/mod.rs | 4 ++ src/utils/ref.rs | 83 +++++++++++++++++++++++++++++++++ src/utils/ref_mut.rs | 93 +++++++++++++++++++++++++++++++++++++ src/utils/slice_ref.rs | 88 +++++++++++++++++++++++++++++++++++ src/utils/slice_ref_mut.rs | 95 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 366 insertions(+), 7 deletions(-) create mode 100644 src/utils/ref.rs create mode 100644 src/utils/ref_mut.rs create mode 100644 src/utils/slice_ref.rs create mode 100644 src/utils/slice_ref_mut.rs diff --git a/src/utils/box.rs b/src/utils/box.rs index 8672c36a0..f9c271a67 100644 --- a/src/utils/box.rs +++ b/src/utils/box.rs @@ -24,9 +24,7 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct BoxCudaRepresentation(*mut T); // Safety: This repr(C) struct only contains a device-owned pointer unsafe impl rustacuda_core::DeviceCopy diff --git a/src/utils/boxed_slice.rs b/src/utils/boxed_slice.rs index e9113d865..e4796f2f2 100644 --- a/src/utils/boxed_slice.rs +++ b/src/utils/boxed_slice.rs @@ -24,11 +24,9 @@ use crate::{ #[allow(clippy::module_name_repetitions)] #[derive(Debug, TypeLayout)] #[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct BoxedSliceCudaRepresentation(*mut T, usize); -// Safety: This repr(C) struct only contains a device-owned pointer +// Safety: This repr(C) struct only contains a device-owned pointer and a usize unsafe impl rustacuda_core::DeviceCopy for BoxedSliceCudaRepresentation { diff --git a/src/utils/mod.rs b/src/utils/mod.rs index dadf5a443..73d422f05 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -6,3 +6,7 @@ pub mod shared; mod r#box; mod boxed_slice; mod option; +mod r#ref; +mod ref_mut; +mod slice_ref; +mod slice_ref_mut; diff --git a/src/utils/ref.rs b/src/utils/ref.rs new file mode 100644 index 000000000..6475d9ccf --- 
/dev/null +++ b/src/utils/ref.rs @@ -0,0 +1,83 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *const T, + _marker: PhantomData<&'a T>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for RefCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = RefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + + Ok(( + DeviceAccessible::from(RefCudaRepresentation { + data: device_box.as_device_ptr().as_raw().cast(), + _marker: PhantomData::<&'a T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe 
impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { + type RustRepresentation = &'a T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + &*this.data + } +} diff --git a/src/utils/ref_mut.rs b/src/utils/ref_mut.rs new file mode 100644 index 000000000..a5cbae62a --- /dev/null +++ b/src/utils/ref_mut.rs @@ -0,0 +1,93 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBox}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[repr(transparent)] +#[derive(TypeLayout)] +#[allow(clippy::module_name_repetitions)] +pub struct RefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *mut T, + _marker: PhantomData<&'a mut T>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for RefMutCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = RefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + + Ok(( + 
DeviceAccessible::from(RefMutCudaRepresentation { + data: device_box.as_device_ptr().as_raw_mut().cast(), + _marker: PhantomData::<&'a mut T>, + }), + CombinedCudaAlloc::new(device_box, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for RefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut T; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let data: *mut T = this.data; + &mut *data + } +} diff --git a/src/utils/slice_ref.rs b/src/utils/slice_ref.rs new file mode 100644 index 000000000..a2a5e5012 --- /dev/null +++ b/src/utils/slice_ref.rs @@ -0,0 +1,88 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, TypeLayout)] +#[repr(C)] +pub struct SliceRefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *const T, + len: usize, + _marker: PhantomData<&'a [T]>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer and a usize +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for SliceRefCudaRepresentation<'a, T> +{ +} + +unsafe 
impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + SafeDeviceCopyWrapper::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: device_buffer.as_ptr().cast(), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for SliceRefCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + core::slice::from_raw_parts(this.data, this.len) + } +} diff --git a/src/utils/slice_ref_mut.rs b/src/utils/slice_ref_mut.rs new file mode 100644 index 000000000..64371a1e3 --- /dev/null +++ b/src/utils/slice_ref_mut.rs @@ -0,0 +1,95 @@ +use core::marker::PhantomData; + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +#[cfg(feature = "host")] +use rustacuda::{error::CudaResult, memory::DeviceBuffer}; + +use crate::{ + common::{CudaAsRust, RustToCuda}, + safety::SafeDeviceCopy, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::common::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::{ + common::{CombinedCudaAlloc, CudaAlloc}, + host::CudaDropWrapper, + 
utils::device_copy::SafeDeviceCopyWrapper, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(Debug, TypeLayout)] +#[repr(C)] +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { + data: *mut T, + len: usize, + _marker: PhantomData<&'a mut [T]>, +} + +// Safety: This repr(C) struct only contains a device-owned pointer and a usize +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy + for SliceRefMutCudaRepresentation<'a, T> +{ +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocation = crate::host::CudaDropWrapper>>; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( + SafeDeviceCopyWrapper::from_slice(self), + )?); + + Ok(( + DeviceAccessible::from(SliceRefMutCudaRepresentation { + data: device_buffer.as_mut_ptr().cast(), + len: device_buffer.len(), + _marker: PhantomData::<&'a mut [T]>, + }), + CombinedCudaAlloc::new(device_buffer, alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> CudaResult { + use rustacuda::memory::CopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + + alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + + core::mem::drop(alloc_front); + + Ok(alloc_tail) + } +} + +unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust + for SliceRefMutCudaRepresentation<'a, T> +{ + type RustRepresentation = &'a mut [T]; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { 
+ core::slice::from_raw_parts_mut(this.data, this.len) + } +} From 8d2d85667308246ed885b05d4c06917b774c1e03 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 14:51:27 +0000 Subject: [PATCH 066/120] Large restructuring of the module layout for rust-cuda --- examples/derive/src/lib.rs | 4 +- examples/print/src/main.rs | 7 +- examples/single-source/src/main.rs | 18 +- .../kernel/wrapper/generate/host_kernel_ty.rs | 4 +- .../generate/host_linker_macro/get_ptx.rs | 4 +- .../wrapper/generate/host_linker_macro/mod.rs | 4 +- rust-cuda-derive/src/kernel/wrapper/mod.rs | 2 +- .../src/rust_to_cuda/field_copy.rs | 44 +- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 10 +- rust-cuda-derive/src/rust_to_cuda/generics.rs | 4 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 40 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 2 +- src/alloc.rs | 52 ++ src/common.rs | 311 ----------- src/device/mod.rs | 73 --- src/host/mod.rs | 525 +----------------- src/kernel/mod.rs | 520 +++++++++++++++++ src/{kernel.rs => kernel/param.rs} | 114 ++-- src/{host => kernel}/ptx_jit/mod.rs | 0 src/{host => kernel}/ptx_jit/preprocess.rs | 0 src/{host => kernel}/ptx_jit/regex.rs | 0 src/{host => kernel}/ptx_jit/replace.rs | 0 src/{utils => lend/impls}/box.rs | 8 +- src/{utils => lend/impls}/boxed_slice.rs | 8 +- src/lend/impls/mod.rs | 7 + src/{utils => lend/impls}/option.rs | 9 +- src/{utils => lend/impls}/ref.rs | 8 +- src/{utils => lend/impls}/ref_mut.rs | 8 +- src/{utils => lend/impls}/slice_ref.rs | 8 +- src/{utils => lend/impls}/slice_ref_mut.rs | 8 +- src/lend/mod.rs | 283 ++++++++++ src/lib.rs | 4 +- src/safety/device_copy.rs | 2 +- src/safety/no_aliasing.rs | 2 +- src/utils/aliasing/const.rs | 21 +- src/utils/aliasing/dynamic.rs | 21 +- src/utils/aliasing/final.rs | 21 +- src/utils/device_copy.rs | 7 +- src/utils/exchange/buffer/common.rs | 6 +- src/utils/exchange/buffer/host.rs | 3 +- src/utils/exchange/buffer/mod.rs | 10 +- src/utils/exchange/wrapper.rs | 7 +- src/utils/ffi.rs | 
133 +++++ src/utils/mod.rs | 9 +- src/utils/shared/mod.rs | 3 + src/utils/shared/slice.rs | 43 ++ 46 files changed, 1269 insertions(+), 1108 deletions(-) create mode 100644 src/alloc.rs delete mode 100644 src/common.rs create mode 100644 src/kernel/mod.rs rename src/{kernel.rs => kernel/param.rs} (93%) rename src/{host => kernel}/ptx_jit/mod.rs (100%) rename src/{host => kernel}/ptx_jit/preprocess.rs (100%) rename src/{host => kernel}/ptx_jit/regex.rs (100%) rename src/{host => kernel}/ptx_jit/replace.rs (100%) rename src/{utils => lend/impls}/box.rs (93%) rename src/{utils => lend/impls}/boxed_slice.rs (93%) create mode 100644 src/lend/impls/mod.rs rename src/{utils => lend/impls}/option.rs (96%) rename src/{utils => lend/impls}/ref.rs (93%) rename src/{utils => lend/impls}/ref_mut.rs (93%) rename src/{utils => lend/impls}/slice_ref.rs (93%) rename src/{utils => lend/impls}/slice_ref_mut.rs (93%) create mode 100644 src/lend/mod.rs create mode 100644 src/utils/ffi.rs diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 76a7d3cb1..622b1b699 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -2,14 +2,14 @@ #![feature(const_type_name)] #![feature(offset_of)] -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] struct Outer { #[cuda(embed)] diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7a26ce2bd..31c6897f3 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -23,7 +23,7 @@ pub enum Action { #[rust_cuda::kernel::kernel(use link! 
for impl)] #[kernel(allow(ptx::local_memory_usage))] -pub fn kernel(action: rust_cuda::kernel::PerThreadShallowCopy) { +pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), Action::Panic => panic!("panic! from CUDA kernel"), @@ -62,11 +62,10 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { )?); // Create a new instance of the CUDA kernel and prepare the launch config - let mut kernel = rust_cuda::host::TypedPtxKernel::::new::(None); - let config = rust_cuda::host::LaunchConfig { + let mut kernel = rust_cuda::kernel::TypedPtxKernel::::new::(None); + let config = rust_cuda::kernel::LaunchConfig { grid: rust_cuda::deps::rustacuda::function::GridSize::x(1), block: rust_cuda::deps::rustacuda::function::BlockSize::x(4), - shared_memory_size: 0, ptx_jit: false, }; diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 383ade30a..40d212294 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -23,7 +23,7 @@ fn main() {} #[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { @@ -31,7 +31,7 @@ pub struct Wrapper { inner: T, } -#[derive(rc::common::LendRustToCuda)] +#[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] pub struct Empty([u8; 0]); @@ -54,18 +54,18 @@ pub struct Triple(i32, i32, i32); pub fn kernel< 'a, T: 'static - + rc::common::RustToCuda< + + rc::lend::RustToCuda< CudaRepresentation: rc::safety::StackOnly, - CudaAllocation: rc::common::EmptyCudaAlloc, + CudaAllocation: rc::alloc::EmptyCudaAlloc, > + rc::safety::StackOnly + rc::safety::NoSafeAliasing, >( - _x: &rc::kernel::PerThreadShallowCopy, - _z: &rc::kernel::SharedHeapPerThreadShallowCopy>, - _v @ _w: &'a rc::kernel::ShallowInteriorMutable, - 
_: rc::kernel::SharedHeapPerThreadShallowCopy>, - q @ Triple(s, mut __t, _u): rc::kernel::PerThreadShallowCopy, + _x: &rc::kernel::param::PerThreadShallowCopy, + _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable, + _: rc::kernel::param::SharedHeapPerThreadShallowCopy>, + q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy, shared3: &mut rc::utils::shared::ThreadBlockShared, dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, ) { diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs index 75c86820f..84ece28b5 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -43,7 +43,7 @@ pub(in super::super) fn quote_host_kernel_ty( pub type #func_ident #generic_start_token #generic_kernel_params #generic_close_token = impl Fn( - &mut #crate_path::host::Launcher<#func_ident #generic_start_token + &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, #(#cuda_kernel_param_tys),* @@ -56,7 +56,7 @@ pub(in super::super) fn quote_host_kernel_ty( fn #private_func_ident #generic_start_token #generic_kernel_params #generic_close_token ( - #launcher: &mut #crate_path::host::Launcher<#func_ident #generic_start_token + #launcher: &mut #crate_path::kernel::Launcher<#func_ident #generic_start_token #(#full_generics),* #generic_close_token>, #func_inputs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index d7394142e..599b68fce 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -47,7 +47,7 @@ pub(super) fn quote_get_ptx( }> = 
#crate_path::safety::kernel_signature::Assert::<{ #crate_path::safety::kernel_signature::check( PTX_CSTR.to_bytes(), - #crate_path::host::specialise_kernel_entry_point!( + #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* #generic_close_token @@ -85,7 +85,7 @@ pub(super) fn quote_get_ptx( #[allow(unused_imports)] use __rust_cuda_ffi_safe_assert::#args; - #crate_path::host::link_kernel!{ + #crate_path::kernel::link_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs index dc609da26..cfc0af751 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -91,14 +91,14 @@ pub(in super::super) fn quote_host_linker_macro( #(#macro_generics),* $(,)? 
#generic_close_token for $ptx:ident ) { - unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::host::CompiledKernelPtx< + unsafe impl<#($#macro_only_lt_generic_ids),*> #crate_path::kernel::CompiledKernelPtx< #func_ident_name #generic_start_token #($#macro_generic_ids),* #generic_close_token > for $ptx #generic_start_token #($#macro_generic_ids),* #generic_close_token { #get_ptx fn get_entry_point() -> &'static ::core::ffi::CStr { - #crate_path::host::specialise_kernel_entry_point!( + #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_non_lt_generic_ids),* #generic_close_token diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-derive/src/kernel/wrapper/mod.rs index f3e1177bc..7793c2dc0 100644 --- a/rust-cuda-derive/src/kernel/wrapper/mod.rs +++ b/rust-cuda-derive/src/kernel/wrapper/mod.rs @@ -346,7 +346,7 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::core::result::Result<(), ()> = #crate_path::host::check_kernel!( + const _: ::core::result::Result<(), ()> = #crate_path::kernel::check_kernel!( #func_ident #func_ident_hash #crate_name #crate_manifest_dir ); } diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 549f5ab56..f6464d197 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -34,12 +34,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( match cuda_repr_field_ty { CudaReprFieldTy::SafeDeviceCopy => { r2c_field_declarations.push(quote! { - let #field_repr_ident = #crate_path::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); r2c_field_async_declarations.push(quote! 
{ - let #field_repr_ident = #crate_path::common::DeviceAccessible::from( + let #field_repr_ident = #crate_path::utils::ffi::DeviceAccessible::from( &self.#field_accessor, ); }); @@ -50,26 +50,26 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor).into_inner() + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor).into_inner() }, }); }, CudaReprFieldTy::RustToCuda { field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::common::CombinedCudaAlloc< - <#field_ty as #crate_path::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( &self.#field_accessor, alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( &self.#field_accessor, alloc_front, stream, @@ -81,13 +81,13 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( &mut self.#field_accessor, alloc_front, )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( &mut self.#field_accessor, alloc_front, stream, @@ -96,30 +96,30 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! 
{ #optional_field_ident { - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) }, }); }, CudaReprFieldTy::RustToCudaProxy { proxy_ty, field_ty } => { combined_cuda_alloc_type = quote! { - #crate_path::common::CombinedCudaAlloc< - <#proxy_ty as #crate_path::common::RustToCuda>::CudaAllocation, + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaAllocation, #combined_cuda_alloc_type > }; r2c_field_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCuda::borrow( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( < - #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, )?; }); r2c_field_async_declarations.push(quote! { - let (#field_repr_ident, alloc_front) = #crate_path::common::RustToCudaAsync::borrow_async( + let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -131,17 +131,17 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }); r2c_field_destructors.push(quote! { - let alloc_front = #crate_path::common::RustToCuda::restore( + let alloc_front = #crate_path::lend::RustToCuda::restore( < - #proxy_ty as #crate_path::common::RustToCudaProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, )?; }); r2c_field_async_destructors.push(quote! 
{ - let alloc_front = #crate_path::common::RustToCudaAsync::restore_async( + let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( < - #proxy_ty as #crate_path::common::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> >::from_mut(&mut self.#field_accessor), alloc_front, stream, @@ -150,8 +150,8 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations.push(quote! { #optional_field_ident { - #crate_path::common::RustToCudaProxy::<#field_ty>::into( - #crate_path::common::CudaAsRust::as_rust(&this.#field_accessor) + #crate_path::lend::RustToCudaProxy::<#field_ty>::into( + #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) ) }, }); diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 21509ef8c..aee846fe3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -36,8 +36,8 @@ pub fn swap_field_type_and_filter_attrs( field_ty: Box::new(field_ty.clone()), }); field_ty = parse_quote! { - #crate_path::common::DeviceAccessible< - <#field_ty as #crate_path::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#field_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; } else { @@ -57,8 +57,8 @@ pub fn swap_field_type_and_filter_attrs( Ok(proxy_ty) => { let old_field_ty = Box::new(field_ty.clone()); field_ty = parse_quote! { - #crate_path::common::DeviceAccessible< - <#proxy_ty as #crate_path::common::RustToCuda>::CudaRepresentation + #crate_path::utils::ffi::DeviceAccessible< + <#proxy_ty as #crate_path::lend::RustToCuda>::CudaRepresentation > }; cuda_repr_field_ty = Some(CudaReprFieldTy::RustToCudaProxy { @@ -107,7 +107,7 @@ pub fn swap_field_type_and_filter_attrs( cuda_repr_field_ty } else { field_ty = parse_quote! 
{ - #crate_path::common::DeviceAccessible< + #crate_path::utils::ffi::DeviceAccessible< #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> > }; diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index b9335db46..4325f39fb 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -190,13 +190,13 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: #crate_path::common::RustToCuda + #ty: #crate_path::lend::RustToCuda }); struct_generics_cuda_async .make_where_clause() .predicates .push(syn::parse_quote! { - #ty: #crate_path::common::RustToCudaAsync + #ty: #crate_path::lend::RustToCudaAsync }); } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 2928cebef..b7dc1eb13 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -76,7 +76,7 @@ pub fn rust_to_cuda_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::RustToCuda for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::lend::RustToCuda for #struct_name #ty_generics #where_clause { type CudaRepresentation = #struct_name_cuda #ty_generics; @@ -84,14 +84,14 @@ pub fn rust_to_cuda_trait( type CudaAllocation = #combined_cuda_alloc_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: CudaAllocType, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::common::DeviceAccessible, - #crate_path::common::CombinedCudaAlloc + #crate_path::utils::ffi::DeviceAccessible, + #crate_path::alloc::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NoCudaAlloc; + let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_declarations)* @@ -99,15 +99,15 @@ pub fn rust_to_cuda_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::utils::ffi::DeviceAccessible::from(borrow), + #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: #crate_path::common::CombinedCudaAlloc< + alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, ) -> #crate_path::deps::rustacuda::error::CudaResult { @@ -149,19 +149,19 @@ pub fn rust_to_cuda_async_trait( let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::RustToCudaAsync for #struct_name #ty_generics + unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics #where_clause { #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: CudaAllocType, stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::common::DeviceAccessible, - #crate_path::common::CombinedCudaAlloc + #crate_path::utils::ffi::DeviceAccessible, + #crate_path::alloc::CombinedCudaAlloc )> { - let alloc_front = #crate_path::common::NoCudaAlloc; + let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; #(#r2c_field_async_declarations)* @@ -169,15 +169,15 @@ pub fn rust_to_cuda_async_trait( let borrow = #rust_to_cuda_struct_construction; Ok(( - #crate_path::common::DeviceAccessible::from(borrow), - #crate_path::common::CombinedCudaAlloc::new(alloc_front, alloc_tail) + #crate_path::utils::ffi::DeviceAccessible::from(borrow), + #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) )) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: #crate_path::common::CombinedCudaAlloc< + alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocation, CudaAllocType >, stream: &#crate_path::deps::rustacuda::stream::Stream, @@ -217,14 +217,14 @@ pub fn cuda_as_rust_trait( let (impl_generics, ty_generics, where_clause) = &struct_generics_cuda.split_for_impl(); quote! 
{ - unsafe impl #impl_generics #crate_path::common::CudaAsRust + unsafe impl #impl_generics #crate_path::lend::CudaAsRust for #struct_name_cuda #ty_generics #where_clause { type RustRepresentation = #struct_name #ty_generics; #[cfg(target_os = "cuda")] unsafe fn as_rust( - this: &#crate_path::common::DeviceAccessible, + this: &#crate_path::utils::ffi::DeviceAccessible, ) -> #struct_name #ty_generics { #cuda_as_rust_struct_construction } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index fb5b39503..6a885ac94 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -31,7 +31,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { ) = generics::expand_cuda_struct_generics_where_requested_in_attrs(ast); let mut combined_cuda_alloc_type: TokenStream = quote! { - #crate_path::common::NoCudaAlloc + #crate_path::alloc::NoCudaAlloc }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); diff --git a/src/alloc.rs b/src/alloc.rs new file mode 100644 index 000000000..f16178aec --- /dev/null +++ b/src/alloc.rs @@ -0,0 +1,52 @@ +#![allow(clippy::module_name_repetitions)] + +pub trait EmptyCudaAlloc: sealed::empty::Sealed {} + +pub trait CudaAlloc: sealed::alloc::Sealed {} + +impl CudaAlloc for Option {} +impl sealed::alloc::Sealed for Option {} + +pub struct NoCudaAlloc; +impl CudaAlloc for NoCudaAlloc {} +impl sealed::alloc::Sealed for NoCudaAlloc {} +impl EmptyCudaAlloc for NoCudaAlloc {} +impl sealed::empty::Sealed for NoCudaAlloc {} + +pub struct SomeCudaAlloc(()); +impl CudaAlloc for SomeCudaAlloc {} +impl sealed::alloc::Sealed for SomeCudaAlloc {} +impl !EmptyCudaAlloc for SomeCudaAlloc {} +impl !sealed::empty::Sealed for SomeCudaAlloc {} + +pub struct CombinedCudaAlloc(A, B); +impl CudaAlloc for CombinedCudaAlloc {} +impl sealed::alloc::Sealed for CombinedCudaAlloc {} +impl EmptyCudaAlloc + for 
CombinedCudaAlloc +{ +} +impl sealed::empty::Sealed + for CombinedCudaAlloc +{ +} +impl CombinedCudaAlloc { + #[must_use] + pub const fn new(front: A, tail: B) -> Self { + Self(front, tail) + } + + pub fn split(self) -> (A, B) { + (self.0, self.1) + } +} + +pub(crate) mod sealed { + pub(super) mod empty { + pub trait Sealed {} + } + + pub mod alloc { + pub trait Sealed {} + } +} diff --git a/src/common.rs b/src/common.rs deleted file mode 100644 index 37d005ac4..000000000 --- a/src/common.rs +++ /dev/null @@ -1,311 +0,0 @@ -use core::marker::PhantomData; -#[cfg(feature = "device")] -use core::{ - convert::{AsMut, AsRef}, - ops::{Deref, DerefMut}, -}; - -#[cfg(feature = "host")] -use core::{mem::MaybeUninit, ptr::copy_nonoverlapping}; -#[cfg(feature = "host")] -use std::fmt; - -use const_type_layout::{TypeGraphLayout, TypeLayout}; -use rustacuda_core::DeviceCopy; - -#[cfg(feature = "derive")] -pub use rust_cuda_derive::LendRustToCuda; - -#[cfg(feature = "host")] -use crate::{safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; - -#[repr(transparent)] -#[cfg_attr(any(feature = "device", doc), derive(Debug))] -#[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} - -#[cfg(feature = "host")] -impl From for DeviceAccessible { - fn from(value: T) -> Self { - Self(value) - } -} - -#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { - fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) - } -} - -#[cfg(all(feature = "host", not(doc)))] -impl fmt::Debug for DeviceAccessible { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - fmt.debug_struct(stringify!(DeviceAccessible)) - .finish_non_exhaustive() - } -} - -#[cfg(feature = "device")] -impl Deref for DeviceAccessible { - type Target = T; - - fn deref(&self) -> 
&Self::Target { - &self.0 - } -} - -#[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCuda { - type CudaAllocation: CudaAlloc; - type CudaRepresentation: CudaAsRust + TypeGraphLayout; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the - /// CPU as it contains a GPU-resident copy of `self`. - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )>; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should ONLY be derived automatically using -/// `#[derive(LendRustToCuda)]` -pub unsafe trait RustToCudaAsync: RustToCuda { - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - /// The returned - /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER - /// be accessed on the CPU as it contains a GPU-resident copy of - /// `self`. 
- #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )>; - - #[doc(hidden)] - #[cfg(feature = "host")] - /// # Errors - /// - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - /// - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - #[allow(clippy::type_complexity)] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult; -} - -/// # Safety -/// -/// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { - type RustRepresentation: RustToCuda; - - #[doc(hidden)] - #[cfg(feature = "device")] - /// # Safety - /// - /// This is an internal function and should NEVER be called manually - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; -} - -pub trait RustToCudaProxy: RustToCuda { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -pub trait RustToCudaAsyncProxy: RustToCudaAsync { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; - - fn into(self) -> T; -} - -#[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *const T, - pub(super) reference: PhantomData<&'r T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - 
pub(super) reference: PhantomData<&'r mut T>, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} - -#[repr(transparent)] -#[derive(TypeLayout)] -pub struct DeviceOwnedRef<'r, T: DeviceCopy> { - #[cfg_attr(feature = "host", allow(dead_code))] - pub(super) pointer: *mut T, - pub(super) reference: PhantomData<&'r mut ()>, - pub(super) marker: PhantomData, -} - -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { - fn as_ref(&self) -> &T { - unsafe { &*self.pointer } - } -} - -#[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { - fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } - } -} - -pub(crate) mod crate_private { - pub mod alloc { - pub trait Sealed {} - } -} - -mod private { - pub mod empty { - pub trait Sealed {} - } -} - -pub trait EmptyCudaAlloc: private::empty::Sealed {} - -pub trait CudaAlloc: crate_private::alloc::Sealed {} - -impl CudaAlloc for Option {} -impl crate_private::alloc::Sealed for Option {} - -pub struct NoCudaAlloc; -impl CudaAlloc for NoCudaAlloc {} -impl crate_private::alloc::Sealed for NoCudaAlloc {} -impl EmptyCudaAlloc for NoCudaAlloc {} -impl private::empty::Sealed for NoCudaAlloc {} - -pub struct SomeCudaAlloc(()); -impl CudaAlloc for SomeCudaAlloc {} -impl crate_private::alloc::Sealed for SomeCudaAlloc {} -impl !EmptyCudaAlloc for SomeCudaAlloc {} -impl !private::empty::Sealed for SomeCudaAlloc {} - -pub struct CombinedCudaAlloc(A, B); -impl CudaAlloc for CombinedCudaAlloc {} -impl crate_private::alloc::Sealed for CombinedCudaAlloc {} -impl 
EmptyCudaAlloc - for CombinedCudaAlloc -{ -} -impl private::empty::Sealed - for CombinedCudaAlloc -{ -} -impl CombinedCudaAlloc { - #[must_use] - pub const fn new(front: A, tail: B) -> Self { - Self(front, tail) - } - - pub fn split(self) -> (A, B) { - (self.0, self.1) - } -} diff --git a/src/device/mod.rs b/src/device/mod.rs index 07894b5bb..0c2a0c83f 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,79 +1,6 @@ -use core::mem::ManuallyDrop; - #[cfg(feature = "derive")] pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; -use crate::{ - common::{CudaAsRust, DeviceAccessible, DeviceConstRef, DeviceOwnedRef, RustToCuda}, - safety::{NoSafeAliasing, SafeDeviceCopy}, -}; - pub mod alloc; pub mod thread; pub mod utils; - -pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceConstRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda`](crate::host::LendToCuda::lend_to_cuda). - unsafe fn with_borrow_from_rust O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O; - - // /// # Safety - // /// - // /// This function is only safe to call iff `cuda_repr_mut` is the - // /// [`DeviceMutRef`] borrowed on the CPU using the corresponding - // /// [`LendToCuda::lend_to_cuda_mut`](crate::host::LendToCuda::lend_to_cuda_mut). - // /// Furthermore, since different GPU threads can access heap storage - // /// mutably inside the safe `inner` scope, there must not be any - // /// aliasing between concurrently running threads. - // unsafe fn with_borrow_from_rust_mut O>( - // cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, inner: F, - // ) -> O; - - /// # Safety - /// - /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::move_to_cuda`](crate::host::LendToCuda::move_to_cuda). 
- unsafe fn with_moved_from_rust O>( - cuda_repr_mut: DeviceOwnedRef::CudaRepresentation>>, - inner: F, - ) -> O - where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy; -} - -impl BorrowFromRust for T { - #[inline] - unsafe fn with_borrow_from_rust O>( - cuda_repr: DeviceConstRef::CudaRepresentation>>, - inner: F, - ) -> O { - // `rust_repr` must never be dropped as we do NOT own any of the - // heap memory it might reference - let rust_repr = ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); - - inner(&rust_repr) - } - - #[inline] - unsafe fn with_moved_from_rust O>( - mut cuda_repr_mut: DeviceOwnedRef< - DeviceAccessible<::CudaRepresentation>, - >, - inner: F, - ) -> O - where - Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, - { - inner(CudaAsRust::as_rust(cuda_repr_mut.as_mut())) - } -} diff --git a/src/host/mod.rs b/src/host/mod.rs index 2d423362a..ba37e32e2 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -1,523 +1,24 @@ use std::{ - ffi::{CStr, CString}, marker::PhantomData, mem::ManuallyDrop, ops::{Deref, DerefMut}, - ptr::NonNull, }; use rustacuda::{ context::Context, error::{CudaError, CudaResult}, event::Event, - function::Function, memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; use rustacuda_core::{DeviceCopy, DevicePointer}; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; - use crate::{ - common::{ - DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef, EmptyCudaAlloc, - NoCudaAlloc, RustToCuda, - }, - kernel::CudaKernelParameter, - safety::{NoSafeAliasing, SafeDeviceCopy}, + safety::SafeDeviceCopy, + utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, }; -mod ptx_jit; -use ptx_jit::{PtxJITCompiler, PtxJITResult}; - -pub struct Launcher<'stream, 'kernel, Kernel> { - pub stream: &'stream Stream, - pub kernel: &'kernel mut TypedPtxKernel, - pub config: LaunchConfig, -} - -macro_rules! 
impl_launcher_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch<$($T: CudaKernelParameter),*>( - &mut self, - $($arg: $T::SyncHostType),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $with_async< - 'a, - Ok, - Err: From, - $($T: CudaKernelParameter),* - >( - &'a mut self, - $($arg: $T::SyncHostType,)* - inner: impl FnOnce( - &'a mut Self, - $($T::AsyncHostType<'stream, '_>),* - ) -> Result, - ) -> Result - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - #[allow(unused_variables)] - let stream = self.stream; - - impl_launcher_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - inner(self, $($arg),*) - } } - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch_async<$($T: CudaKernelParameter),*>( - &mut self, - $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) - } - }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { - impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } - }) - }; -} - -impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { - impl_launcher_launch! { launch0() => with0_async => launch0_async } - - impl_launcher_launch! 
{ launch1( - arg1: A - ) => with1_async => launch1_async } - - impl_launcher_launch! { launch2( - arg1: A, arg2: B - ) => with2_async => launch2_async } - - impl_launcher_launch! { launch3( - arg1: A, arg2: B, arg3: C - ) => with3_async => launch3_async } - - impl_launcher_launch! { launch4( - arg1: A, arg2: B, arg3: C, arg4: D - ) => with4_async => launch4_async } - - impl_launcher_launch! { launch5( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => with5_async => launch5_async } - - impl_launcher_launch! { launch6( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => with6_async => launch6_async } - - impl_launcher_launch! { launch7( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => with7_async => launch7_async } - - impl_launcher_launch! { launch8( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => with8_async => launch8_async } - - impl_launcher_launch! { launch9( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => with9_async => launch9_async } - - impl_launcher_launch! { launch10( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => with10_async => launch10_async } - - impl_launcher_launch! { launch11( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K - ) => with11_async => launch11_async } - - impl_launcher_launch! 
{ launch12( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K, arg12: L - ) => with12_async => launch12_async } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct LaunchConfig { - pub grid: rustacuda::function::GridSize, - pub block: rustacuda::function::BlockSize, - pub shared_memory_size: u32, - pub ptx_jit: bool, -} - -pub struct RawPtxKernel { - module: ManuallyDrop>, - function: ManuallyDrop>, -} - -impl RawPtxKernel { - /// # Errors - /// - /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does - /// not contain an entry point named `entry_point`. - pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); - - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); - - let function = match function { - Ok(function) => function, - Err(err) => { - if let Err((_err, module)) = Module::drop(*module) { - std::mem::forget(module); - } - - return Err(err); - }, - }; - - Ok(Self { - function: ManuallyDrop::new(function), - module: ManuallyDrop::new(module), - }) - } - - #[must_use] - pub fn get_function(&self) -> &Function { - &self.function - } -} - -impl Drop for RawPtxKernel { - fn drop(&mut self) { - { - // Ensure that self.function is dropped before self.module as - // it borrows data from the module and must not outlive it - let _function = unsafe { ManuallyDrop::take(&mut self.function) }; - } - - if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) - { - std::mem::forget(module); - } - } -} - -pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; - -pub struct TypedPtxKernel { - compiler: PtxJITCompiler, - ptx_kernel: Option, - entry_point: Box, - configure: Option>, - marker: PhantomData, -} - -macro_rules! 
impl_typed_kernel_launch { - ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch<$($T: CudaKernelParameter),*>( - &mut self, - stream: &Stream, - config: &LaunchConfig, - $($arg: $T::SyncHostType),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - self.$with_async::<(), CudaError, $($T),*>( - stream, - config, - $($arg,)* - |kernel, stream, config, $($arg),*| { - let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); - - // important: always synchronise here, this function is sync! - match (stream.synchronize(), result) { - (Ok(()), result) => result, - (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), - } - }, - ) - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $with_async< - 'a, - 'stream, - Ok, - Err: From, - $($T: CudaKernelParameter),* - >( - &'a mut self, - stream: &'stream Stream, - config: &LaunchConfig, - $($arg: $T::SyncHostType,)* - inner: impl FnOnce( - &'a mut Self, - &'stream Stream, - &LaunchConfig, - $($T::AsyncHostType<'stream, '_>),* - ) -> Result, - ) -> Result - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - impl_typed_kernel_launch! { impl with_new_async ($($arg: $T),*) + (stream) { - inner(self, stream, config, $($arg),*) - } } - } - - #[allow(clippy::missing_errors_doc)] - #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args - #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args - pub fn $launch_async<'stream, $($T: CudaKernelParameter),*>( - &mut self, - stream: &'stream Stream, - config: &LaunchConfig, - $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> - where - Kernel: FnOnce(&mut Launcher, $($T),*), - { - let function = if config.ptx_jit { - impl_typed_kernel_launch! 
{ impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { - self.compile_with_ptx_jit_args(Some(&[$($arg),*])) - } }? - } else { - self.compile_with_ptx_jit_args(None)? - }; - - unsafe { stream.launch( - function, - config.grid.clone(), - config.block.clone(), - config.shared_memory_size, - &[ - $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg) - ).cast::()),* - ], - ) } - } - }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { - impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } - }) - }; - (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { - $inner - }; - (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { - $T0::$func(&$arg0 $(, $other)*, |$arg0| { - impl_typed_kernel_launch! { impl $func ref ($($arg: $T),*) + ($($other),*) $inner } - }) - }; -} - -impl TypedPtxKernel { - #[must_use] - pub fn new>(configure: Option>) -> Self { - let compiler = PtxJITCompiler::new(T::get_ptx()); - let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); - - Self { - compiler, - ptx_kernel: None, - entry_point, - configure, - marker: PhantomData::, - } - } -} - -impl TypedPtxKernel { - impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } - - impl_typed_kernel_launch! { launch1( - arg1: A - ) => with1_async => launch1_async } - - impl_typed_kernel_launch! { launch2( - arg1: A, arg2: B - ) => with2_async => launch2_async } - - impl_typed_kernel_launch! { launch3( - arg1: A, arg2: B, arg3: C - ) => with3_async => launch3_async } - - impl_typed_kernel_launch! { launch4( - arg1: A, arg2: B, arg3: C, arg4: D - ) => with4_async => launch4_async } - - impl_typed_kernel_launch! 
{ launch5( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E - ) => with5_async => launch5_async } - - impl_typed_kernel_launch! { launch6( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F - ) => with6_async => launch6_async } - - impl_typed_kernel_launch! { launch7( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G - ) => with7_async => launch7_async } - - impl_typed_kernel_launch! { launch8( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H - ) => with8_async => launch8_async } - - impl_typed_kernel_launch! { launch9( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I - ) => with9_async => launch9_async } - - impl_typed_kernel_launch! { launch10( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J - ) => with10_async => launch10_async } - - impl_typed_kernel_launch! { launch11( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K - ) => with11_async => launch11_async } - - impl_typed_kernel_launch! { launch12( - arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, - arg11: K, arg12: L - ) => with12_async => launch12_async } - - /// # Errors - /// - /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to - /// [`Self::new`] is not a valid PTX source or does not contain the - /// entry point it declares. 
- fn compile_with_ptx_jit_args( - &mut self, - arguments: Option<&[Option<&NonNull<[u8]>>]>, - ) -> CudaResult<&Function> { - let ptx_jit = self.compiler.with_arguments(arguments); - - let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { - (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), - (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { - let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; - - // Replace the existing compiled kernel, drop the old one - let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); - - let function = ptx_kernel.get_function(); - - if let Some(configure) = self.configure.as_mut() { - configure(function)?; - } - - function - }, - }; - - Ok(kernel_jit) - } -} - -pub trait LendToCuda: RustToCuda + NoSafeAliasing { - /// Lends an immutable copy of `&self` to CUDA: - /// - code in the CUDA kernel can only access `&self` through the - /// [`DeviceConstRef`] inside the closure - /// - after the closure, `&self` will not have changed - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result; - - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] - /// - /// # Errors - /// - /// Returns a [`CudaError`] iff an error occurs inside CUDA - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: RustToCuda; -} - -impl LendToCuda for T { - fn lend_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceConstRef::CudaRepresentation>>, - ) -> Result, - >( - &self, - inner: F, - ) -> Result { - let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); - - core::mem::drop(cuda_repr); - core::mem::drop(alloc); 
- - result - } - - fn move_to_cuda< - O, - E: From, - F: FnOnce( - HostAndDeviceOwned::CudaRepresentation>>, - ) -> Result, - >( - self, - inner: F, - ) -> Result - where - Self: RustToCuda, - { - let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; - - let result = HostAndDeviceOwned::with_new(cuda_repr, inner); - - core::mem::drop(alloc); - - result - } -} - pub trait CudaDroppable: Sized { #[allow(clippy::missing_errors_doc)] fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; @@ -525,8 +26,8 @@ pub trait CudaDroppable: Sized { #[repr(transparent)] pub struct CudaDropWrapper(ManuallyDrop); -impl crate::common::CudaAlloc for CudaDropWrapper {} -impl crate::common::crate_private::alloc::Sealed for CudaDropWrapper {} +impl crate::alloc::CudaAlloc for CudaDropWrapper {} +impl crate::alloc::sealed::alloc::Sealed for CudaDropWrapper {} impl From for CudaDropWrapper { fn from(val: C) -> Self { Self(ManuallyDrop::new(val)) @@ -646,8 +147,8 @@ impl Drop for HostLockedBox { #[allow(clippy::module_name_repetitions)] pub struct HostDeviceBox(DevicePointer); -impl crate::common::CudaAlloc for HostDeviceBox {} -impl crate::common::crate_private::alloc::Sealed for HostDeviceBox {} +impl crate::alloc::CudaAlloc for HostDeviceBox {} +impl crate::alloc::sealed::alloc::Sealed for HostDeviceBox {} impl HostDeviceBox { /// # Errors @@ -1146,17 +647,3 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea self.host_val } } - -/// # Safety -/// -/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond -/// to the compiled kernel code for the `Kernel` function and contain a kernel -/// entry point whose name is returned by -/// [`CompiledKernelPtx::get_entry_point`]. -/// -/// This trait should not be implemented manually – use the -/// [`kernel`](crate::kernel::kernel) macro instead. 
-pub unsafe trait CompiledKernelPtx { - fn get_ptx() -> &'static CStr; - fn get_entry_point() -> &'static CStr; -} diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs new file mode 100644 index 000000000..f5aeeb4bf --- /dev/null +++ b/src/kernel/mod.rs @@ -0,0 +1,520 @@ +#[cfg(feature = "host")] +use std::{ + ffi::{CStr, CString}, + marker::PhantomData, + mem::ManuallyDrop, + ptr::NonNull, +}; + +use const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::{ + error::{CudaError, CudaResult}, + function::Function, + module::Module, + stream::Stream, +}; + +#[cfg(feature = "derive")] +pub use rust_cuda_derive::kernel; + +#[doc(hidden)] +#[cfg(all(feature = "derive", feature = "host"))] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; + +#[cfg(feature = "host")] +mod ptx_jit; +#[cfg(feature = "host")] +use ptx_jit::{PtxJITCompiler, PtxJITResult}; + +pub mod param; + +mod sealed { + #[doc(hidden)] + pub trait Sealed {} +} + +pub trait CudaKernelParameter: sealed::Sealed { + #[cfg(feature = "host")] + type SyncHostType; + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b>; + #[doc(hidden)] + type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b>; + + #[cfg(feature = "host")] + #[allow(clippy::missing_errors_doc)] // FIXME + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O; + + #[doc(hidden)] + #[cfg(feature = "host")] + fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> std::alloc::Layout; + + #[doc(hidden)] + #[cfg(feature = "host")] + 
fn async_to_ffi<'stream, 'b>( + param: Self::AsyncHostType<'stream, 'b>, + ) -> Self::FfiType<'stream, 'b>; + + #[doc(hidden)] + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O; +} + +#[cfg(feature = "host")] +pub struct Launcher<'stream, 'kernel, Kernel> { + pub stream: &'stream Stream, + pub kernel: &'kernel mut TypedPtxKernel, + pub config: LaunchConfig, +} + +#[cfg(feature = "host")] +macro_rules! impl_launcher_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch::<$($T),*>(self.stream, &self.config, $($arg),*) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'a, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'a mut self, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'a mut Self, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + #[allow(unused_variables)] + let stream = self.stream; + + impl_launcher_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<$($T: CudaKernelParameter),*>( + &mut self, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), + { + self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> { + impl_launcher_launch! { launch0() => with0_async => launch0_async } + + impl_launcher_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_launcher_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_launcher_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_launcher_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_launcher_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_launcher_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_launcher_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_launcher_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_launcher_launch! 
{ launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_launcher_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_launcher_launch! { launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_launcher_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } +} + +#[cfg(feature = "host")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LaunchConfig { + pub grid: rustacuda::function::GridSize, + pub block: rustacuda::function::BlockSize, + pub ptx_jit: bool, +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct RawPtxKernel { + module: ManuallyDrop>, + function: ManuallyDrop>, +} + +#[cfg(feature = "host")] +impl RawPtxKernel { + /// # Errors + /// + /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does + /// not contain an entry point named `entry_point`. 
+ pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { + let module = Box::new(Module::load_from_string(ptx)?); + + let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + + let function = match function { + Ok(function) => function, + Err(err) => { + if let Err((_err, module)) = Module::drop(*module) { + std::mem::forget(module); + } + + return Err(err); + }, + }; + + Ok(Self { + function: ManuallyDrop::new(function), + module: ManuallyDrop::new(module), + }) + } + + #[must_use] + pub fn get_function(&self) -> &Function { + &self.function + } +} + +#[cfg(feature = "host")] +impl Drop for RawPtxKernel { + fn drop(&mut self) { + { + // Ensure that self.function is dropped before self.module as + // it borrows data from the module and must not outlive it + let _function = unsafe { ManuallyDrop::take(&mut self.function) }; + } + + if let Err((_err, module)) = Module::drop(*unsafe { ManuallyDrop::take(&mut self.module) }) + { + std::mem::forget(module); + } + } +} + +#[cfg(feature = "host")] +pub type PtxKernelConfigure = dyn FnMut(&Function) -> CudaResult<()>; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub struct TypedPtxKernel { + compiler: PtxJITCompiler, + ptx_kernel: Option, + entry_point: Box, + configure: Option>, + marker: PhantomData, +} + +#[cfg(feature = "host")] +macro_rules! 
impl_typed_kernel_launch { + ($launch:ident($($arg:ident : $T:ident),*) => $with_async:ident => $launch_async:ident) => { + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + self.$with_async::<(), CudaError, $($T),*>( + stream, + config, + $($arg,)* + |kernel, stream, config, $($arg),*| { + let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + + // important: always synchronise here, this function is sync! + match (stream.synchronize(), result) { + (Ok(()), result) => result, + (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), + } + }, + ) + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $with_async< + 'kernel, + 'stream, + Ok, + Err: From, + $($T: CudaKernelParameter),* + >( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::SyncHostType,)* + inner: impl FnOnce( + &'kernel mut Self, + &'stream Stream, + &LaunchConfig, + $($T::AsyncHostType<'stream, '_>),* + ) -> Result, + ) -> Result + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + impl_typed_kernel_launch! 
{ impl with_new_async ($($arg: $T),*) + (stream) { + inner(self, stream, config, $($arg),*) + } } + } + + #[allow(clippy::missing_errors_doc)] + #[allow(clippy::needless_lifetimes)] // 'stream is unused for zero args + #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args + pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( + &'kernel mut self, + stream: &'stream Stream, + config: &LaunchConfig, + $($arg: $T::AsyncHostType<'stream, '_>),* + ) -> CudaResult<()> + where + Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), + { + let function = if config.ptx_jit { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + self.compile_with_ptx_jit_args(Some(&[$($arg),*])) + } }? + } else { + self.compile_with_ptx_jit_args(None)? + }; + + #[allow(unused_mut)] + let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new(); + $( + shared_memory_size.add($T::shared_layout_for_async(&$arg)); + )* + let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else { + // FIXME: this should really be InvalidConfiguration = 9 + return Err(CudaError::LaunchOutOfResources) + }; + + unsafe { stream.launch( + function, + config.grid.clone(), + config.block.clone(), + shared_memory_size, + &[ + $(core::ptr::from_mut( + &mut $T::async_to_ffi($arg) + ).cast::()),* + ], + ) } + } + }; + (impl $func:ident () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func($arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } + }) + }; + (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + $inner + }; + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + $T0::$func(&$arg0 $(, $other)*, |$arg0| { + impl_typed_kernel_launch! 
{ impl $func ref ($($arg: $T),*) + ($($other),*) $inner } + }) + }; +} + +#[cfg(feature = "host")] +impl TypedPtxKernel { + #[must_use] + pub fn new>(configure: Option>) -> Self { + let compiler = PtxJITCompiler::new(T::get_ptx()); + let entry_point = CString::from(T::get_entry_point()).into_boxed_c_str(); + + Self { + compiler, + ptx_kernel: None, + entry_point, + configure, + marker: PhantomData::, + } + } +} + +#[cfg(feature = "host")] +impl TypedPtxKernel { + impl_typed_kernel_launch! { launch0() => with0_async => launch0_async } + + impl_typed_kernel_launch! { launch1( + arg1: A + ) => with1_async => launch1_async } + + impl_typed_kernel_launch! { launch2( + arg1: A, arg2: B + ) => with2_async => launch2_async } + + impl_typed_kernel_launch! { launch3( + arg1: A, arg2: B, arg3: C + ) => with3_async => launch3_async } + + impl_typed_kernel_launch! { launch4( + arg1: A, arg2: B, arg3: C, arg4: D + ) => with4_async => launch4_async } + + impl_typed_kernel_launch! { launch5( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E + ) => with5_async => launch5_async } + + impl_typed_kernel_launch! { launch6( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F + ) => with6_async => launch6_async } + + impl_typed_kernel_launch! { launch7( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G + ) => with7_async => launch7_async } + + impl_typed_kernel_launch! { launch8( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H + ) => with8_async => launch8_async } + + impl_typed_kernel_launch! { launch9( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I + ) => with9_async => launch9_async } + + impl_typed_kernel_launch! { launch10( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J + ) => with10_async => launch10_async } + + impl_typed_kernel_launch! 
{ launch11( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K + ) => with11_async => launch11_async } + + impl_typed_kernel_launch! { launch12( + arg1: A, arg2: B, arg3: C, arg4: D, arg5: E, arg6: F, arg7: G, arg8: H, arg9: I, arg10: J, + arg11: K, arg12: L + ) => with12_async => launch12_async } + + /// # Errors + /// + /// Returns a [`CudaError`] if the [`CompiledKernelPtx`] provided to + /// [`Self::new`] is not a valid PTX source or does not contain the + /// entry point it declares. + fn compile_with_ptx_jit_args( + &mut self, + arguments: Option<&[Option<&NonNull<[u8]>>]>, + ) -> CudaResult<&Function> { + let ptx_jit = self.compiler.with_arguments(arguments); + + let kernel_jit = match (&mut self.ptx_kernel, ptx_jit) { + (Some(ptx_kernel), PtxJITResult::Cached(_)) => ptx_kernel.get_function(), + (ptx_kernel, PtxJITResult::Cached(ptx_cstr) | PtxJITResult::Recomputed(ptx_cstr)) => { + let recomputed_ptx_kernel = RawPtxKernel::new(ptx_cstr, &self.entry_point)?; + + // Replace the existing compiled kernel, drop the old one + let ptx_kernel = ptx_kernel.insert(recomputed_ptx_kernel); + + let function = ptx_kernel.get_function(); + + if let Some(configure) = self.configure.as_mut() { + configure(function)?; + } + + function + }, + }; + + Ok(kernel_jit) + } +} + +#[cfg(feature = "host")] +/// # Safety +/// +/// The PTX string returned by [`CompiledKernelPtx::get_ptx`] must correspond +/// to the compiled kernel code for the `Kernel` function and contain a kernel +/// entry point whose name is returned by +/// [`CompiledKernelPtx::get_entry_point`]. +/// +/// This trait should not be implemented manually – use the +/// [`kernel`] macro instead. 
+pub unsafe trait CompiledKernelPtx { + fn get_ptx() -> &'static CStr; + fn get_entry_point() -> &'static CStr; +} diff --git a/src/kernel.rs b/src/kernel/param.rs similarity index 93% rename from src/kernel.rs rename to src/kernel/param.rs index 98ae0220c..2e4461051 100644 --- a/src/kernel.rs +++ b/src/kernel/param.rs @@ -6,58 +6,16 @@ use core::{ }; #[cfg(feature = "host")] -use core::ptr::NonNull; +use std::{alloc::Layout, ptr::NonNull}; use const_type_layout::TypeGraphLayout; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; - -use crate::common::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef, EmptyCudaAlloc, RustToCuda}; - -mod sealed { - #[doc(hidden)] - pub trait Sealed {} -} - -pub trait CudaKernelParameter: sealed::Sealed { - #[cfg(feature = "host")] - type SyncHostType; - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; - #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; - - #[cfg(feature = "host")] - #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; - - #[doc(hidden)] - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( - param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; - - #[doc(hidden)] - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; -} +use crate::{ + alloc::EmptyCudaAlloc, + kernel::{sealed, CudaKernelParameter}, + lend::RustToCuda, + utils::ffi::{DeviceAccessible, DeviceConstRef, 
DeviceOwnedRef}, +}; pub struct PtxJit { never: !, @@ -144,6 +102,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -219,6 +182,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -280,6 +248,11 @@ impl< inner(Some(¶m_as_raw_bytes(param.for_host()))) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -374,6 +347,11 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -477,7 +455,7 @@ impl< _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) } #[cfg(feature = "host")] @@ -488,6 +466,11 @@ impl< inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -500,7 +483,7 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - unsafe { 
crate::device::BorrowFromRust::with_moved_from_rust(param, inner) } + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } } } impl< @@ -534,7 +517,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) } #[cfg(feature = "host")] @@ -545,6 +528,11 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -557,7 +545,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - unsafe { crate::device::BorrowFromRust::with_borrow_from_rust(param, inner) } + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed @@ -609,6 +597,11 @@ impl< as CudaKernelParameter>::async_to_ffi(param) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "device")] unsafe fn with_ffi_as_device( param: Self::FfiType<'static, 'static>, @@ -664,6 +657,11 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara inner(Some(¶m_as_raw_bytes(param.for_host()))) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 
'b>( param: Self::AsyncHostType<'stream, 'b>, @@ -758,6 +756,11 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + Layout::new::<()>() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( _param: Self::AsyncHostType<'stream, 'b>, @@ -810,6 +813,11 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(None) } + #[cfg(feature = "host")] + fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> Layout { + param.layout() + } + #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b>( param: Self::AsyncHostType<'stream, 'b>, diff --git a/src/host/ptx_jit/mod.rs b/src/kernel/ptx_jit/mod.rs similarity index 100% rename from src/host/ptx_jit/mod.rs rename to src/kernel/ptx_jit/mod.rs diff --git a/src/host/ptx_jit/preprocess.rs b/src/kernel/ptx_jit/preprocess.rs similarity index 100% rename from src/host/ptx_jit/preprocess.rs rename to src/kernel/ptx_jit/preprocess.rs diff --git a/src/host/ptx_jit/regex.rs b/src/kernel/ptx_jit/regex.rs similarity index 100% rename from src/host/ptx_jit/regex.rs rename to src/kernel/ptx_jit/regex.rs diff --git a/src/host/ptx_jit/replace.rs b/src/kernel/ptx_jit/replace.rs similarity index 100% rename from src/host/ptx_jit/replace.rs rename to src/kernel/ptx_jit/replace.rs diff --git a/src/utils/box.rs b/src/lend/impls/box.rs similarity index 93% rename from src/utils/box.rs rename to src/lend/impls/box.rs index f9c271a67..4acfd7b2c 100644 --- a/src/utils/box.rs +++ b/src/lend/impls/box.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use 
crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -36,7 +36,7 @@ unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/utils/boxed_slice.rs b/src/lend/impls/boxed_slice.rs similarity index 93% rename from src/utils/boxed_slice.rs rename to src/lend/impls/boxed_slice.rs index e4796f2f2..6e1c95d90 100644 --- a/src/utils/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -36,7 +36,7 @@ unsafe impl RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; #[cfg(feature = "host")] diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs new file mode 100644 index 000000000..18f546bbd --- /dev/null +++ b/src/lend/impls/mod.rs @@ -0,0 +1,7 @@ +mod r#box; +mod boxed_slice; +mod option; +mod r#ref; +mod ref_mut; +mod slice_ref; +mod 
slice_ref_mut; diff --git a/src/utils/option.rs b/src/lend/impls/option.rs similarity index 96% rename from src/utils/option.rs rename to src/lend/impls/option.rs index dec109f38..291a4a255 100644 --- a/src/utils/option.rs +++ b/src/lend/impls/option.rs @@ -6,16 +6,13 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::error::CudaResult; use crate::{ - common::{ - CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, - RustToCudaProxy, - }, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, safety::SafeDeviceCopy, - utils::device_copy::SafeDeviceCopyWrapper, + utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc}; +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/ref.rs b/src/lend/impls/ref.rs similarity index 93% rename from src/utils/ref.rs rename to src/lend/impls/ref.rs index 6475d9ccf..c6aee84e6 100644 --- a/src/utils/ref.rs +++ b/src/lend/impls/ref.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -39,7 +39,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = 
crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/ref_mut.rs b/src/lend/impls/ref_mut.rs similarity index 93% rename from src/utils/ref_mut.rs rename to src/lend/impls/ref_mut.rs index a5cbae62a..a4f4dbe29 100644 --- a/src/utils/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -39,7 +39,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/slice_ref.rs b/src/lend/impls/slice_ref.rs similarity index 93% rename from src/utils/slice_ref.rs rename to src/lend/impls/slice_ref.rs index a2a5e5012..6108f9ccd 100644 --- a/src/utils/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ 
- common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -40,7 +40,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/utils/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs similarity index 93% rename from src/utils/slice_ref_mut.rs rename to src/lend/impls/slice_ref_mut.rs index 64371a1e3..b2f79abf9 100644 --- a/src/utils/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -6,16 +6,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ - common::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] use crate::{ - common::{CombinedCudaAlloc, CudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::device_copy::SafeDeviceCopyWrapper, }; @@ -40,7 +40,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] - type CudaAllocation = crate::common::SomeCudaAlloc; + type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; #[cfg(feature = "host")] diff --git a/src/lend/mod.rs b/src/lend/mod.rs new file mode 100644 index 000000000..a6cffea3d --- /dev/null +++ b/src/lend/mod.rs @@ -0,0 +1,283 @@ +use 
const_type_layout::TypeGraphLayout; +#[cfg(feature = "host")] +use rustacuda::error::CudaError; +use rustacuda_core::DeviceCopy; + +#[cfg(feature = "derive")] +#[allow(clippy::module_name_repetitions)] +pub use rust_cuda_derive::LendRustToCuda; + +use crate::{alloc::CudaAlloc, utils::ffi::DeviceAccessible}; + +#[cfg(feature = "device")] +use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +#[cfg(feature = "host")] +use crate::{ + alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + host::{HostAndDeviceConstRef, HostAndDeviceOwned}, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::safety::{NoSafeAliasing, SafeDeviceCopy}; + +mod impls; + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCuda { + type CudaAllocation: CudaAlloc; + type CudaRepresentation: CudaAsRust; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned [`Self::CudaRepresentation`] must NEVER be accessed on the + /// CPU as it contains a GPU-resident copy of `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )>; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult; +} + +/// # Safety +/// +/// This is an internal trait and should ONLY be derived automatically using +/// `#[derive(LendRustToCuda)]` +pub unsafe trait RustToCudaAsync: RustToCuda { + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + /// The returned + /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER + /// be accessed on the CPU as it contains a GPU-resident copy of + /// `self`. 
+ #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )>; + + #[doc(hidden)] + #[cfg(feature = "host")] + /// # Errors + /// + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA + /// + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + #[allow(clippy::type_complexity)] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult; +} + +/// # Safety +/// +/// This is an internal trait and should NEVER be implemented manually +pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { + type RustRepresentation: RustToCuda; + + #[doc(hidden)] + #[cfg(feature = "device")] + /// # Safety + /// + /// This is an internal function and should NEVER be called manually + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation; +} + +pub trait RustToCudaProxy: RustToCuda { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +pub trait RustToCudaAsyncProxy: RustToCudaAsync { + fn from_ref(val: &T) -> &Self; + fn from_mut(val: &mut T) -> &mut Self; + + fn into(self) -> T; +} + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCuda: RustToCuda + NoSafeAliasing { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result; + + /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] + /// + 
/// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCuda for T { + fn lend_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceConstRef::CudaRepresentation>>, + ) -> Result, + >( + &self, + inner: F, + ) -> Result { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn move_to_cuda< + O, + E: From, + F: FnOnce( + HostAndDeviceOwned::CudaRepresentation>>, + ) -> Result, + >( + self, + inner: F, + ) -> Result + where + Self: RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, inner); + + core::mem::drop(alloc); + + result + } +} + +#[cfg(feature = "device")] +pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. + unsafe fn with_borrow_from_rust O>( + cuda_repr: DeviceConstRef::CudaRepresentation>>, + inner: F, + ) -> O; + + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr` is the + /// [`DeviceOwnedRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::move_to_cuda`]. 
+ unsafe fn with_moved_from_rust O>( + cuda_repr: DeviceOwnedRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: Sized, + ::CudaRepresentation: SafeDeviceCopy; +} + +#[cfg(feature = "device")] +impl BorrowFromRust for T { + #[inline] + unsafe fn with_borrow_from_rust O>( + cuda_repr: DeviceConstRef::CudaRepresentation>>, + inner: F, + ) -> O { + // `rust_repr` must never be dropped as we do NOT own any of the + // heap memory it might reference + let rust_repr = core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr.as_ref())); + + inner(&rust_repr) + } + + #[inline] + unsafe fn with_moved_from_rust O>( + mut cuda_repr: DeviceOwnedRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: Sized, + ::CudaRepresentation: SafeDeviceCopy, + { + inner(CudaAsRust::as_rust(cuda_repr.as_mut())) + } +} diff --git a/src/lib.rs b/src/lib.rs index 6ba80f56f..c782c4047 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ #![feature(inline_const)] #![feature(sync_unsafe_cell)] #![feature(never_type)] +#![feature(layout_for_ptr)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] @@ -44,9 +45,10 @@ core::compile_error!("cannot enable the `host` feature on a target with `target_ #[cfg(all(feature = "device", not(target_os = "cuda"), not(doc)))] core::compile_error!("cannot enable the `device` feature on a target without `target_os=\"cuda\"`"); -pub mod common; +pub mod alloc; pub mod deps; pub mod kernel; +pub mod lend; pub mod safety; pub mod utils; diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index 9aedc8e81..a2bfc9552 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -1,6 +1,6 @@ use const_type_layout::TypeGraphLayout; -use crate::{common::DeviceAccessible, safety::StackOnly}; +use crate::{safety::StackOnly, utils::ffi::DeviceAccessible}; #[allow(clippy::module_name_repetitions)] /// Types which are safe to 
memcpy from the CPU to a GPU. diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index f5c80e354..7baa06f19 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -63,7 +63,7 @@ impl !NoSafeAliasing for *mut T {} unsafe impl NoSafeAliasing for core::marker::PhantomData {} unsafe impl NoSafeAliasing for r#final::Final {} -unsafe impl NoSafeAliasing +unsafe impl NoSafeAliasing for crate::utils::aliasing::FinalCudaRepresentation { } diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 131a05803..b3a28cf25 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -8,7 +8,10 @@ use core::{ use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(transparent)] #[derive(Clone, TypeLayout)] @@ -193,12 +196,12 @@ unsafe impl RustToCuda #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow(alloc)?; @@ -209,9 +212,9 @@ unsafe impl RustToCuda } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.0.restore(alloc) } @@ -222,13 +225,13 @@ unsafe impl RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -239,9 
+242,9 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index a6577fc6f..50f028ec3 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -8,7 +8,10 @@ use core::{ use const_type_layout::TypeLayout; use rustacuda_core::DeviceCopy; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[repr(C)] #[derive(Clone, TypeLayout)] @@ -167,12 +170,12 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow(alloc)?; @@ -186,9 +189,9 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { self.inner.restore(alloc) } @@ -197,13 +200,13 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, 
alloc) = self.inner.borrow_async(alloc, stream)?; @@ -217,9 +220,9 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs index 230ea4e8a..432910920 100644 --- a/src/utils/aliasing/final.rs +++ b/src/utils/aliasing/final.rs @@ -1,7 +1,10 @@ use const_type_layout::TypeLayout; use r#final::Final; -use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda, RustToCudaAsync}; +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; #[doc(hidden)] #[repr(transparent)] @@ -18,12 +21,12 @@ unsafe impl RustToCuda for Final { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow( + unsafe fn borrow( &self, alloc: A, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = (**self).borrow(alloc)?; @@ -34,9 +37,9 @@ unsafe impl RustToCuda for Final { } #[cfg(feature = "host")] - unsafe fn restore( + unsafe fn restore( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T let inner: &mut T = &mut *(self as *mut Self).cast(); @@ -48,13 +51,13 @@ unsafe impl RustToCuda for Final { unsafe impl RustToCudaAsync for Final { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async( &self, alloc: A, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::common::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let 
(cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; @@ -65,9 +68,9 @@ unsafe impl RustToCudaAsync for Final { } #[cfg(feature = "host")] - unsafe fn restore_async( + unsafe fn restore_async( &mut self, - alloc: crate::common::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { // Safety: Final is a repr(transparent) newtype wrapper around T diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 1f03c1799..2363b4855 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -3,15 +3,16 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ - common::{CudaAsRust, NoCudaAlloc, RustToCuda, RustToCudaAsync}, + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::SafeDeviceCopy, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::DeviceAccessible; +use crate::utils::ffi::DeviceAccessible; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc}; +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 31f50cb68..450ed0975 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,7 +1,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use rustacuda_core::DeviceCopy; -use crate::{common::CudaAsRust, safety::SafeDeviceCopy}; +use crate::{lend::CudaAsRust, safety::SafeDeviceCopy}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -29,7 +29,9 @@ unsafe impl; #[cfg(feature = "device")] - unsafe fn as_rust(this: &crate::common::DeviceAccessible) -> Self::RustRepresentation { + unsafe fn as_rust( + this: &crate::utils::ffi::DeviceAccessible, + ) -> Self::RustRepresentation { CudaExchangeBuffer { inner: super::device::CudaExchangeBufferDevice(core::mem::ManuallyDrop::new( 
crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 9bbf8a0af..58e200881 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -10,9 +10,10 @@ use rustacuda::{ }; use crate::{ - common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible, NoCudaAlloc}, + alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, safety::SafeDeviceCopy, + utils::ffi::DeviceAccessible, }; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index dcbbc036f..c1dea16d0 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -12,10 +12,16 @@ use const_type_layout::TypeGraphLayout; use crate::safety::SafeDeviceCopy; #[cfg(any(feature = "host", feature = "device"))] -use crate::common::{NoCudaAlloc, RustToCuda, RustToCudaAsync}; +use crate::{ + alloc::NoCudaAlloc, + lend::{RustToCuda, RustToCudaAsync}, +}; #[cfg(feature = "host")] -use crate::common::{CombinedCudaAlloc, CudaAlloc, DeviceAccessible}; +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + utils::ffi::DeviceAccessible, +}; #[cfg(any(feature = "host", feature = "device"))] use self::common::CudaExchangeBufferCudaRepresentation; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 5f64d3d05..2e9decc51 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -14,14 +14,13 @@ use rustacuda::{ }; use crate::{ - common::{ - CombinedCudaAlloc, DeviceAccessible, EmptyCudaAlloc, NoCudaAlloc, RustToCuda, - RustToCudaAsync, - }, + alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, }, + lend::{RustToCuda, RustToCudaAsync}, + 
utils::ffi::DeviceAccessible, }; #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs new file mode 100644 index 000000000..98fd945e7 --- /dev/null +++ b/src/utils/ffi.rs @@ -0,0 +1,133 @@ +use core::marker::PhantomData; +#[cfg(feature = "device")] +use core::{ + convert::{AsMut, AsRef}, + ops::{Deref, DerefMut}, +}; +#[cfg(feature = "host")] +use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; + +#[cfg(feature = "host")] +use const_type_layout::TypeGraphLayout; +use const_type_layout::TypeLayout; +use rustacuda_core::DeviceCopy; + +#[cfg(feature = "host")] +use crate::{lend::CudaAsRust, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; + +#[repr(transparent)] +#[cfg_attr(any(feature = "device", doc), derive(Debug))] +#[derive(TypeLayout)] +pub struct DeviceAccessible(T); + +unsafe impl DeviceCopy for DeviceAccessible {} + +#[cfg(feature = "host")] +impl From for DeviceAccessible { + fn from(value: T) -> Self { + Self(value) + } +} + +#[cfg(feature = "host")] +impl From<&T> for DeviceAccessible> { + fn from(value: &T) -> Self { + let value = unsafe { + let mut uninit = MaybeUninit::uninit(); + copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); + uninit.assume_init() + }; + + Self(SafeDeviceCopyWrapper::from(value)) + } +} + +#[cfg(all(feature = "host", not(doc)))] +impl fmt::Debug for DeviceAccessible { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + fmt.debug_struct(stringify!(DeviceAccessible)) + .finish_non_exhaustive() + } +} + +#[cfg(feature = "device")] +impl Deref for DeviceAccessible { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(feature = "device")] +impl DerefMut for DeviceAccessible { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +#[repr(transparent)] +#[derive(Clone, Copy, TypeLayout)] +pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: 
*const T, + pub(crate) reference: PhantomData<&'r T>, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: *mut T, + pub(crate) reference: PhantomData<&'r mut T>, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} + +#[repr(transparent)] +#[derive(TypeLayout)] +pub struct DeviceOwnedRef<'r, T: DeviceCopy> { + #[cfg_attr(feature = "host", allow(dead_code))] + pub(crate) pointer: *mut T, + pub(crate) reference: PhantomData<&'r mut ()>, + pub(crate) marker: PhantomData, +} + +unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { + fn as_ref(&self) -> &T { + unsafe { &*self.pointer } + } +} + +#[cfg(feature = "device")] +impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { + fn as_mut(&mut self) -> &mut T { + unsafe { &mut *self.pointer } + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 73d422f05..65a4379fb 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,12 +1,5 @@ pub mod aliasing; pub mod device_copy; pub mod exchange; +pub mod ffi; pub mod shared; - -mod r#box; -mod boxed_slice; -mod option; -mod r#ref; -mod ref_mut; -mod slice_ref; -mod slice_ref_mut; diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index dfd3f2019..b01dda26d 100644 --- 
a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -9,3 +9,6 @@ pub use r#static::ThreadBlockShared; #[doc(hidden)] #[cfg(feature = "device")] pub use slice::init; + +#[cfg(feature = "host")] +pub(crate) use slice::SharedMemorySize; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index f60276e6b..72ed7fde1 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,3 +1,5 @@ +use core::alloc::Layout; + use const_type_layout::TypeGraphLayout; #[allow(clippy::module_name_repetitions)] @@ -44,6 +46,12 @@ impl ThreadBlockSharedSlice { self.len() == 0 } + #[must_use] + pub fn layout(&self) -> Layout { + // Safety: the length of self.shared is always initialised + unsafe { Layout::for_value_raw(self.shared) } + } + #[cfg(feature = "device")] #[must_use] pub const fn as_mut_ptr(&self) -> *mut T { @@ -124,3 +132,38 @@ pub unsafe fn init() { #[cfg(feature = "device")] core::arch::global_asm!(".extern .shared .align 8 .b8 rust_cuda_dynamic_shared_base[];"); + +#[cfg(feature = "host")] +pub struct SharedMemorySize { + last_align: usize, + total_size: usize, +} + +#[cfg(feature = "host")] +impl SharedMemorySize { + #[must_use] + pub const fn new() -> Self { + Self { + // we allocate the shared memory with an alignment of 8 + last_align: 8, + total_size: 0, + } + } + + pub fn add(&mut self, layout: core::alloc::Layout) { + if layout.align() > self.last_align { + // in the worst case, we are one element of the smaller alignment + // into the larger alignment, so we need to pad the entire rest + let pessimistic_padding = layout.align() - self.last_align; + + self.total_size += pessimistic_padding; + } + + self.last_align = layout.align(); + self.total_size += layout.size(); + } + + pub const fn total(self) -> usize { + self.total_size + } +} From 3020fb08a583b79ec1b4294246f8909ce2a4cece Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:09:56 +0000 Subject: [PATCH 067/120] Split rust-cuda-kernel off from 
rust-cuda-derive --- .vscode/settings.json | 3 +- Cargo.toml | 10 +-- examples/print/Cargo.toml | 4 +- examples/single-source/Cargo.toml | 4 +- rust-cuda-derive/Cargo.toml | 20 +++--- rust-cuda-derive/src/lib.rs | 62 +++---------------- .../src/rust_to_cuda/field_copy.rs | 2 + rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 1 + rust-cuda-derive/src/rust_to_cuda/impl.rs | 1 + rust-cuda-kernel/Cargo.toml | 30 +++++++++ .../build.rs | 0 .../src/kernel/link/config.rs | 0 .../src/kernel/link/error.rs | 0 .../src/kernel/link/mod.rs | 0 .../src/kernel/link/ptx_compiler_sys.rs | 0 .../src/kernel/lints.rs | 0 .../src/kernel/mod.rs | 0 .../src/kernel/specialise/entry_point.rs | 0 .../src/kernel/specialise/function.rs | 0 .../src/kernel/specialise/mod.rs | 0 .../src/kernel/specialise/ty.rs | 0 .../src/kernel/utils.rs | 0 .../src/kernel/wrapper/config.rs | 0 .../wrapper/generate/cuda_generic_function.rs | 0 .../kernel/wrapper/generate/cuda_wrapper.rs | 0 .../kernel/wrapper/generate/host_kernel_ty.rs | 0 .../generate/host_linker_macro/args_trait.rs | 0 .../generate/host_linker_macro/get_ptx.rs | 0 .../wrapper/generate/host_linker_macro/mod.rs | 0 .../src/kernel/wrapper/generate/mod.rs | 0 .../src/kernel/wrapper/mod.rs | 0 .../src/kernel/wrapper/parse.rs | 0 rust-cuda-kernel/src/lib.rs | 60 ++++++++++++++++++ rust-toolchain | 2 +- src/device/mod.rs | 5 +- src/kernel/mod.rs | 8 +-- 36 files changed, 128 insertions(+), 84 deletions(-) create mode 100644 rust-cuda-kernel/Cargo.toml rename {rust-cuda-derive => rust-cuda-kernel}/build.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/config.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/error.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/link/ptx_compiler_sys.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/lints.rs (100%) rename {rust-cuda-derive => 
rust-cuda-kernel}/src/kernel/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/entry_point.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/function.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/specialise/ty.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/utils.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/config.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/cuda_generic_function.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/cuda_wrapper.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_kernel_ty.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/host_linker_macro/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/generate/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/mod.rs (100%) rename {rust-cuda-derive => rust-cuda-kernel}/src/kernel/wrapper/parse.rs (100%) create mode 100644 rust-cuda-kernel/src/lib.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index c2b4219f5..ddfa41463 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,7 +6,8 @@ "rust-analyzer.cargo.allFeatures": false, "rust-analyzer.cargo.features": [ "derive", - "host" + "host", + "kernel" ], "rust-analyzer.showUnlinkedFileNotification": false, } diff --git a/Cargo.toml b/Cargo.toml index 12a90ef59..655359684 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [workspace] members = [ - ".", "rust-cuda-derive", + ".", 
"rust-cuda-derive", "rust-cuda-kernel", "examples/derive", "examples/print", "examples/single-source", ] default-members = [ - ".", "rust-cuda-derive", + ".", "rust-cuda-derive", "rust-cuda-kernel", ] [package] @@ -19,9 +19,10 @@ rust-version = "1.75" # nightly [features] default = [] -host = ["dep:rustacuda", "dep:regex"] -device = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] +device = [] +host = ["dep:rustacuda", "dep:regex"] +kernel = ["dep:rust-cuda-kernel"] [dependencies] rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc" } @@ -36,3 +37,4 @@ const-type-layout = { version = "0.2.1", features = ["derive"] } final = "0.1.1" rust-cuda-derive = { path = "rust-cuda-derive", optional = true } +rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/examples/print/Cargo.toml b/examples/print/Cargo.toml index 05f3a537e..b7f864b58 100644 --- a/examples/print/Cargo.toml +++ b/examples/print/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "device"] } +rust-cuda = { path = "../../", features = ["kernel", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rust-cuda = { path = "../../", features = ["derive", "host"] } +rust-cuda = { path = "../../", features = ["kernel", "host"] } diff --git a/examples/single-source/Cargo.toml b/examples/single-source/Cargo.toml index 6f53359cd..1a27dd30e 100644 --- a/examples/single-source/Cargo.toml +++ b/examples/single-source/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [target.'cfg(target_os = "cuda")'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive", "device"] } +rc = { package = "rust-cuda", path = "../../", features = 
["derive", "kernel", "device"] } [target.'cfg(not(target_os = "cuda"))'.dependencies] -rc = { package = "rust-cuda", path = "../../", features = ["derive", "host"] } +rc = { package = "rust-cuda", path = "../../", features = ["derive", "kernel", "host"] } diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 60677b1dd..73a74907b 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" -links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,15 +15,10 @@ syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -regex = "1.5" -lazy_static = "1.4" -serde_json = "1.0" -cargo_metadata = { version = "0.18", features = ["builder"] } -strip-ansi-escapes = "0.2" -colored = "2.0" -thiserror = "1.0" -seahash = "4.1" -ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } - -[build-dependencies] -find_cuda_helper = "0.2" +# regex = "1.5" +# lazy_static = "1.4" +# serde_json = "1.0" +# cargo_metadata = { version = "0.18", features = ["builder"] } +# strip-ansi-escapes = "0.2" +# colored = "2.0" +# thiserror = "1.0" diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 4651be684..fba846798 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,13 +1,12 @@ -#![deny(clippy::pedantic)] -#![feature(box_patterns)] -#![feature(proc_macro_tracked_env)] -#![feature(proc_macro_span)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] #![feature(if_let_guard)] #![feature(let_chains)] -#![feature(map_try_insert)] -#![feature(proc_macro_def_site)] -#![feature(cfg_version)] 
-#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; @@ -17,14 +16,8 @@ extern crate proc_macro_error; use proc_macro::TokenStream; -mod kernel; mod rust_to_cuda; -// cargo expand --target x86_64-unknown-linux-gnu --ugly \ -// | rustfmt --config max_width=160 > out.rs -// cargo expand --target nvptx64-nvidia-cuda --ugly \ -// | rustfmt --config max_width=160 > out.rs - #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { @@ -37,44 +30,3 @@ pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Build the implementation of the `RustToCuda` and `CudaAsRust` traits rust_to_cuda::impl_rust_to_cuda(&ast) } - -#[proc_macro_error] -#[proc_macro_attribute] -pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::wrapper::kernel(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { - kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro_attribute] -pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { - kernel::specialise::function::specialise_kernel_function(attr, func) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn check_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::check_kernel(tokens) -} - -#[doc(hidden)] -#[proc_macro_error] -#[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) -} diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs 
index f6464d197..10f528730 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -21,10 +21,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( c2r_field_initialisations: &mut Vec, ) -> TokenStream { + #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, None => proc_macro2::Literal::usize_unsuffixed(field_index).to_token_stream(), }; + #[allow(clippy::option_if_let_else)] let field_repr_ident = match &field.ident { Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index aee846fe3..313daf86b 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -103,6 +103,7 @@ pub fn swap_field_type_and_filter_attrs( } }); + #[allow(clippy::option_if_let_else)] let cuda_repr_field_ty = if let Some(cuda_repr_field_ty) = cuda_repr_field_ty { cuda_repr_field_ty } else { diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index b7dc1eb13..5eee100c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -23,6 +23,7 @@ pub fn cuda_struct_declaration( quote! 
{ #[repr(C)] } }; + #[allow(clippy::option_if_let_else)] let struct_fields_where_clause = if let Some(struct_semi_cuda) = struct_semi_cuda { quote!(#struct_fields_cuda #where_clause #struct_semi_cuda) } else { diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml new file mode 100644 index 000000000..23e641841 --- /dev/null +++ b/rust-cuda-kernel/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "rust-cuda-kernel" +version = "0.1.0" +authors = ["Juniper Tyree "] +license = "MIT OR Apache-2.0" +edition = "2021" +links = "libnvptxcompiler_static" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +proc-macro = true + +[dependencies] +syn = { version = "1.0", features = ["full", "fold"] } +quote = "1.0" +proc-macro2 = "1.0" +proc-macro-error = "1.0" +regex = "1.5" +lazy_static = "1.4" +serde_json = "1.0" +cargo_metadata = { version = "0.18", features = ["builder"] } +strip-ansi-escapes = "0.2" +colored = "2.0" +thiserror = "1.0" +seahash = "4.1" +ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } + +[build-dependencies] +find_cuda_helper = "0.2" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-kernel/build.rs similarity index 100% rename from rust-cuda-derive/build.rs rename to rust-cuda-kernel/build.rs diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/config.rs rename to rust-cuda-kernel/src/kernel/link/config.rs diff --git a/rust-cuda-derive/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/error.rs rename to rust-cuda-kernel/src/kernel/link/error.rs diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/mod.rs rename to 
rust-cuda-kernel/src/kernel/link/mod.rs diff --git a/rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs similarity index 100% rename from rust-cuda-derive/src/kernel/link/ptx_compiler_sys.rs rename to rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs diff --git a/rust-cuda-derive/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs similarity index 100% rename from rust-cuda-derive/src/kernel/lints.rs rename to rust-cuda-kernel/src/kernel/lints.rs diff --git a/rust-cuda-derive/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/mod.rs rename to rust-cuda-kernel/src/kernel/mod.rs diff --git a/rust-cuda-derive/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/entry_point.rs rename to rust-cuda-kernel/src/kernel/specialise/entry_point.rs diff --git a/rust-cuda-derive/src/kernel/specialise/function.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/function.rs rename to rust-cuda-kernel/src/kernel/specialise/function.rs diff --git a/rust-cuda-derive/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/mod.rs rename to rust-cuda-kernel/src/kernel/specialise/mod.rs diff --git a/rust-cuda-derive/src/kernel/specialise/ty.rs b/rust-cuda-kernel/src/kernel/specialise/ty.rs similarity index 100% rename from rust-cuda-derive/src/kernel/specialise/ty.rs rename to rust-cuda-kernel/src/kernel/specialise/ty.rs diff --git a/rust-cuda-derive/src/kernel/utils.rs b/rust-cuda-kernel/src/kernel/utils.rs similarity index 100% rename from rust-cuda-derive/src/kernel/utils.rs rename to rust-cuda-kernel/src/kernel/utils.rs diff --git 
a/rust-cuda-derive/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/config.rs rename to rust-cuda-kernel/src/kernel/wrapper/config.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cuda_generic_function.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/cuda_wrapper.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_kernel_ty.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs 
b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/host_linker_macro/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/generate/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/mod.rs diff --git a/rust-cuda-derive/src/kernel/wrapper/parse.rs b/rust-cuda-kernel/src/kernel/wrapper/parse.rs similarity index 100% rename from rust-cuda-derive/src/kernel/wrapper/parse.rs rename to rust-cuda-kernel/src/kernel/wrapper/parse.rs diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs new file mode 100644 index 000000000..b26a78531 --- /dev/null +++ b/rust-cuda-kernel/src/lib.rs @@ -0,0 +1,60 @@ +#![deny(clippy::pedantic)] +#![feature(box_patterns)] +#![feature(proc_macro_tracked_env)] +#![feature(proc_macro_span)] +#![feature(let_chains)] +#![feature(map_try_insert)] +#![feature(proc_macro_def_site)] +#![feature(cfg_version)] +#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] +#![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] + +extern crate proc_macro; + +#[macro_use] +extern crate proc_macro_error; + +use proc_macro::TokenStream; + +mod kernel; + +#[proc_macro_error] +#[proc_macro_attribute] +pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::wrapper::kernel(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { + 
kernel::specialise::ty::specialise_kernel_type(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { + kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro_attribute] +pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { + kernel::specialise::function::specialise_kernel_function(attr, func) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn check_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::check_kernel(tokens) +} + +#[doc(hidden)] +#[proc_macro_error] +#[proc_macro] +pub fn link_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::link_kernel(tokens) +} diff --git a/rust-toolchain b/rust-toolchain index d6e655e5f..e6cfef665 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,5 @@ [toolchain] # Pin to final 1.75.0 nightly -channel = "nightly-2023-11-10" +channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/device/mod.rs b/src/device/mod.rs index 0c2a0c83f..791035d51 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,5 +1,6 @@ -#[cfg(feature = "derive")] -pub use rust_cuda_derive::{specialise_kernel_function, specialise_kernel_type}; +#[doc(hidden)] +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_type}; pub mod alloc; pub mod thread; diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index f5aeeb4bf..0f490c9b0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -15,13 +15,13 @@ use rustacuda::{ stream::Stream, }; -#[cfg(feature = "derive")] -pub use rust_cuda_derive::kernel; +#[cfg(feature = "kernel")] +pub use rust_cuda_kernel::kernel; #[doc(hidden)] -#[cfg(all(feature = "derive", feature = "host"))] +#[cfg(all(feature = "kernel", feature = "host"))] 
#[allow(clippy::module_name_repetitions)] -pub use rust_cuda_derive::{check_kernel, link_kernel, specialise_kernel_entry_point}; +pub use rust_cuda_kernel::{check_kernel, link_kernel, specialise_kernel_entry_point}; #[cfg(feature = "host")] mod ptx_jit; From cc6edd0bd2e4dcf5e8b5ba2887b6b5b7f0fa0f97 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:11:30 +0000 Subject: [PATCH 068/120] Update codecov action to handle rust-cuda-kernel --- .github/workflows/coverage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 176d98baa..c54f606d5 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -60,6 +60,7 @@ jobs: -t lcov -o coverage.lcov --branch \ --keep-only "src/*" \ --keep-only "rust-cuda-derive/*" \ + --keep-only "rust-cuda-kernel/*" \ --ignore-not-existing \ --excl-line GRCOV_EXCL_LINE \ --excl-start GRCOV_EXCL_START \ From 1c864b56ede235928ab035de2daf758e31694476 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 25 Dec 2023 15:17:14 +0000 Subject: [PATCH 069/120] Fix clippy lint --- src/lend/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index a6cffea3d..6f7bab5d7 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -7,7 +7,7 @@ use rustacuda_core::DeviceCopy; #[allow(clippy::module_name_repetitions)] pub use rust_cuda_derive::LendRustToCuda; -use crate::{alloc::CudaAlloc, utils::ffi::DeviceAccessible}; +use crate::alloc::CudaAlloc; #[cfg(feature = "device")] use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; @@ -16,9 +16,11 @@ use crate::{ alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; - #[cfg(any(feature = "host", feature = "device"))] -use crate::safety::{NoSafeAliasing, SafeDeviceCopy}; +use crate::{ + safety::{NoSafeAliasing, SafeDeviceCopy}, + utils::ffi::DeviceAccessible, +}; mod impls; From 
ce8b69a13cb5fab47f5e84299291f2f09d8eb707 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 26 Dec 2023 18:46:02 +0000 Subject: [PATCH 070/120] Far too much time spent getting rid of DeviceCopy --- Cargo.toml | 2 - examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 5 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 6 +- src/host/mod.rs | 304 ++++++---------------- src/kernel/mod.rs | 5 +- src/kernel/param.rs | 259 ++++++------------ src/lend/impls/box.rs | 24 +- src/lend/impls/boxed_slice.rs | 38 +-- src/lend/impls/option.rs | 10 +- src/lend/impls/ref.rs | 25 +- src/lend/impls/ref_mut.rs | 23 +- src/lend/impls/slice_ref.rs | 23 +- src/lend/impls/slice_ref_mut.rs | 23 +- src/lend/mod.rs | 26 +- src/safety/device_copy.rs | 29 --- src/safety/mod.rs | 6 +- src/safety/no_aliasing.rs | 86 ------ src/safety/portable.rs | 63 +++++ src/safety/stack_only.rs | 18 +- src/utils/aliasing/const.rs | 8 - src/utils/aliasing/dynamic.rs | 5 - src/utils/aliasing/final.rs | 90 ------- src/utils/aliasing/mod.rs | 3 - src/utils/device_copy.rs | 23 +- src/utils/exchange/buffer/common.rs | 26 +- src/utils/exchange/buffer/device.rs | 8 +- src/utils/exchange/buffer/host.rs | 58 +++-- src/utils/exchange/buffer/mod.rs | 50 ++-- src/utils/exchange/wrapper.rs | 78 ++++-- src/utils/ffi.rs | 150 ++++++++--- 31 files changed, 584 insertions(+), 892 deletions(-) delete mode 100644 src/safety/device_copy.rs delete mode 100644 src/safety/no_aliasing.rs create mode 100644 src/safety/portable.rs delete mode 100644 src/utils/aliasing/final.rs diff --git a/Cargo.toml b/Cargo.toml index 655359684..bbabb2007 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,5 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } -final = "0.1.1" - rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/examples/print/src/main.rs 
b/examples/print/src/main.rs index 31c6897f3..7423f06ac 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -12,7 +12,7 @@ extern crate alloc; -#[derive(rust_cuda::deps::const_type_layout::TypeLayout)] +#[derive(Copy, Clone, rust_cuda::deps::const_type_layout::TypeLayout)] #[layout(crate = "rust_cuda::deps::const_type_layout")] #[repr(C)] pub enum Action { diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 40d212294..13f2b7efe 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -41,7 +41,7 @@ pub struct Empty([u8; 0]); pub struct Tuple(u32, i32); #[repr(C)] -#[derive(rc::deps::const_type_layout::TypeLayout)] +#[derive(Copy, Clone, rc::deps::const_type_layout::TypeLayout)] #[layout(crate = "rc::deps::const_type_layout")] pub struct Triple(i32, i32, i32); @@ -58,8 +58,7 @@ pub fn kernel< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::alloc::EmptyCudaAlloc, > - + rc::safety::StackOnly - + rc::safety::NoSafeAliasing, + + rc::safety::StackOnly, >( _x: &rc::kernel::param::PerThreadShallowCopy, _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 5eee100c1..612d77c5a 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -12,7 +12,7 @@ pub fn cuda_struct_declaration( struct_fields_cuda: &syn::Fields, struct_semi_cuda: Option, ) -> TokenStream { - let (impl_generics, ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); + let (_impl_generics, _ty_generics, where_clause) = struct_generics_cuda.split_for_impl(); let struct_repr = if struct_attrs_cuda .iter() @@ -41,10 +41,6 @@ pub fn cuda_struct_declaration( #(#struct_layout_attrs)* #[layout(crate = #const_type_layout_crate_path)] #struct_vis_cuda struct #struct_name_cuda #struct_generics_cuda #struct_fields_where_clause - - // 
#[derive(DeviceCopy)] can interfer with type parameters - unsafe impl #impl_generics #crate_path::deps::rustacuda_core::DeviceCopy - for #struct_name_cuda #ty_generics #where_clause {} } } diff --git a/src/host/mod.rs b/src/host/mod.rs index ba37e32e2..e480de9f2 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -6,17 +6,22 @@ use std::{ use rustacuda::{ context::Context, - error::{CudaError, CudaResult}, + error::CudaError, event::Event, - memory::{DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, + memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, stream::Stream, }; -use rustacuda_core::{DeviceCopy, DevicePointer}; use crate::{ - safety::SafeDeviceCopy, - utils::ffi::{DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, + safety::PortableBitSemantics, + utils::{ + device_copy::SafeDeviceCopyWrapper, + ffi::{ + DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, + DeviceOwnedRef, + }, + }, }; pub trait CudaDroppable: Sized { @@ -56,20 +61,29 @@ impl DerefMut for CudaDropWrapper { } } -macro_rules! impl_sealed_drop_collection { - ($type:ident) => { - impl CudaDroppable for $type { - fn drop(val: Self) -> Result<(), (CudaError, Self)> { - Self::drop(val) - } - } - }; +impl CudaDroppable for DeviceBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for DeviceBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} + +impl CudaDroppable for LockedBox { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } } -impl_sealed_drop_collection!(DeviceBuffer); -impl_sealed_drop_collection!(DeviceBox); -impl_sealed_drop_collection!(LockedBuffer); -impl_sealed_drop_collection!(LockedBox); +impl CudaDroppable for LockedBuffer { + fn drop(val: Self) -> Result<(), (CudaError, Self)> { + Self::drop(val) + } +} macro_rules! 
impl_sealed_drop_value { ($type:ident) => { @@ -86,188 +100,20 @@ impl_sealed_drop_value!(Stream); impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); -#[repr(transparent)] #[allow(clippy::module_name_repetitions)] -pub struct HostLockedBox(*mut T); - -impl HostLockedBox { - /// # Errors - /// Returns a [`CudaError`] iff an error occurs inside CUDA - pub fn new(value: T) -> CudaResult { - // Safety: uninitialised memory is immediately written to without reading it - let locked_ptr = unsafe { - let locked_ptr: *mut T = LockedBox::into_raw(LockedBox::uninitialized()?); - locked_ptr.write(value); - locked_ptr - }; - - Ok(Self(locked_ptr)) - } -} - -impl Deref for HostLockedBox { - type Target = T; - - fn deref(&self) -> &Self::Target { - unsafe { &*self.0 } - } -} - -impl DerefMut for HostLockedBox { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { &mut *self.0 } - } -} - -impl From> for HostLockedBox { - fn from(locked_box: LockedBox) -> Self { - Self(LockedBox::into_raw(locked_box)) - } -} - -impl From> for LockedBox { - fn from(host_locked_box: HostLockedBox) -> Self { - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - unsafe { Self::from_raw(host_locked_box.0) } - } -} - -impl Drop for HostLockedBox { - fn drop(&mut self) { - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let locked_box = unsafe { LockedBox::from_raw(self.0) }; - - core::mem::drop(CudaDropWrapper::from(locked_box)); - } -} - -#[repr(transparent)] -#[allow(clippy::module_name_repetitions)] -pub struct HostDeviceBox(DevicePointer); - -impl crate::alloc::CudaAlloc for HostDeviceBox {} -impl crate::alloc::sealed::alloc::Sealed for HostDeviceBox {} - -impl HostDeviceBox { - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `value` into `self` failed. 
- pub fn copy_from(&mut self, value: &T) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_from(&mut *device_box, value) - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `self` into `value` failed. - pub fn copy_to(&self, value: &mut T) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - - rustacuda::memory::CopyDestination::copy_to(&*device_box, value) - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `value` into `self` failed. - /// - /// # Safety - /// - /// To use the data inside the device box, either - /// - the passed-in [`Stream`] must be synchronised - /// - the kernel must be launched on the passed-in [`Stream`] - pub unsafe fn async_copy_from( - &mut self, - value: &HostLockedBox, - stream: &Stream, - ) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let mut device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; - - unsafe { - rustacuda::memory::AsyncCopyDestination::async_copy_from( - &mut *device_box, - &*locked_box, - stream, - ) - } - } - - /// # Errors - /// - /// Returns a [`CudaError`] iff copying from `self` into `value` failed. - /// - /// # Safety - /// - /// To use the data inside `value`, the passed-in [`Stream`] must be - /// synchronised. 
- pub unsafe fn async_copy_to( - &self, - value: &mut HostLockedBox, - stream: &Stream, - ) -> CudaResult<()> { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - let device_box = unsafe { ManuallyDrop::new(DeviceBox::from_device(self.0)) }; - // Safety: pointer comes from [`LockedBox::into_raw`] - // i.e. this function completes the roundtrip - let mut locked_box = unsafe { ManuallyDrop::new(LockedBox::from_raw(value.0)) }; - - unsafe { - rustacuda::memory::AsyncCopyDestination::async_copy_to( - &*device_box, - &mut *locked_box, - stream, - ) - } - } -} - -impl From> for HostDeviceBox { - fn from(device_box: DeviceBox) -> Self { - Self(DeviceBox::into_device(device_box)) - } -} - -impl From> for DeviceBox { - fn from(host_device_box: HostDeviceBox) -> Self { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. this function completes the roundtrip - unsafe { Self::from_device(host_device_box.0) } - } -} - -impl Drop for HostDeviceBox { - fn drop(&mut self) { - // Safety: pointer comes from [`DeviceBox::into_device`] - // i.e. 
this function completes the roundtrip - let device_box = unsafe { DeviceBox::from_device(self.0) }; - - core::mem::drop(CudaDropWrapper::from(device_box)); - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, } -impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new(device_box: &'a mut HostDeviceBox, host_ref: &'a mut T) -> Self { + pub unsafe fn new( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { Self { device_box, host_ref, @@ -286,7 +132,8 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { host_ref: &mut T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceMutRef { @@ -295,7 +142,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { }); // Copy back any changes made - device_box.copy_to(host_ref)?; + device_box.copy_to(SafeDeviceCopyWrapper::from_mut(host_ref))?; core::mem::drop(device_box); @@ -308,7 +155,7 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { 'a: 'b, { DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), reference: PhantomData, } } @@ -354,24 +201,27 @@ impl<'a, T: DeviceCopy> HostAndDeviceMutRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics> { + 
device_box: &'a DeviceBox>, host_ref: &'a T, } -impl<'a, T: DeviceCopy> Clone for HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRef<'a, T> { fn clone(&self) -> Self { *self } } -impl<'a, T: DeviceCopy> Copy for HostAndDeviceConstRef<'a, T> {} +impl<'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRef<'a, T> {} -impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new(device_box: &'a HostDeviceBox, host_ref: &'a T) -> Self { + pub const unsafe fn new( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { Self { device_box, host_ref, @@ -390,7 +240,8 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { host_ref: &T, inner: F, ) -> Result { - let device_box: HostDeviceBox<_> = DeviceBox::new(host_ref)?.into(); + let device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceConstRef { @@ -408,8 +259,10 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { where 'a: 'b, { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + DeviceConstRef { - pointer: self.device_box.0.as_raw(), + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), reference: PhantomData, } } @@ -441,12 +294,12 @@ impl<'a, T: DeviceCopy> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, } -impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { +impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { /// # Errors 
/// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -455,7 +308,8 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { mut value: T, inner: F, ) -> Result { - let mut device_box: HostDeviceBox<_> = DeviceBox::new(&value)?.into(); + let mut device_box = + CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&value))?); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { @@ -467,7 +321,7 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { #[must_use] pub fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, reference: PhantomData::<&'a mut ()>, } @@ -489,18 +343,18 @@ impl<'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwned<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut HostDeviceBox, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: &'stream Stream, ) -> Self { @@ -523,7 +377,7 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { 'a: 'b, { DeviceMutRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), reference: PhantomData, } } @@ -559,27 +413,27 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceMutRefAsync<'stream, 'a, T> { 
} #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: DeviceCopy> { - device_box: &'a HostDeviceBox, +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a DeviceBox>, host_ref: &'a T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: DeviceCopy> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { fn clone(&self) -> Self { *self } } -impl<'stream, 'a, T: DeviceCopy> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} +impl<'stream, 'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} -impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, 'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` #[must_use] pub const unsafe fn new( - device_box: &'a HostDeviceBox, + device_box: &'a DeviceBox>, host_ref: &'a T, stream: &'stream Stream, ) -> Self { @@ -601,8 +455,10 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { where 'a: 'b, { + let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); + DeviceConstRef { - pointer: self.device_box.0.as_raw(), + pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), reference: PhantomData, } } @@ -622,13 +478,13 @@ impl<'stream, 'a, T: DeviceCopy> HostAndDeviceConstRefAsync<'stream, 'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> { - device_box: &'a mut HostDeviceBox, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'stream, 
'a, T> { +impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceOwnedAsync<'stream, 'a, T> { #[must_use] /// # Safety /// @@ -636,7 +492,7 @@ impl<'stream, 'a, T: SafeDeviceCopy + DeviceCopy> HostAndDeviceOwnedAsync<'strea /// constructed-with [`Stream`] pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { - pointer: self.device_box.0.as_raw_mut(), + pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, reference: PhantomData::<&'a mut ()>, } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 0f490c9b0..29b3795c0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -6,7 +6,6 @@ use std::{ ptr::NonNull, }; -use const_type_layout::TypeGraphLayout; #[cfg(feature = "host")] use rustacuda::{ error::{CudaError, CudaResult}, @@ -28,6 +27,8 @@ mod ptx_jit; #[cfg(feature = "host")] use ptx_jit::{PtxJITCompiler, PtxJITResult}; +use crate::safety::PortableBitSemantics; + pub mod param; mod sealed { @@ -41,7 +42,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b>; #[doc(hidden)] - type FfiType<'stream, 'b>: rustacuda_core::DeviceCopy + TypeGraphLayout; + type FfiType<'stream, 'b>: PortableBitSemantics; #[cfg(any(feature = "device", doc))] type DeviceType<'b>; diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 2e4461051..9b2499b51 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -14,6 +14,7 @@ use crate::{ alloc::EmptyCudaAlloc, kernel::{sealed, CudaKernelParameter}, lend::RustToCuda, + safety::PortableBitSemantics, utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef}, }; @@ -36,20 +37,13 @@ impl DerefMut for PtxJit { } } -pub struct PerThreadShallowCopy< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, -> { +pub struct PerThreadShallowCopy { never: !, _marker: PhantomData, } -impl< - T: crate::safety::SafeDeviceCopy - + 
crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > Deref for PerThreadShallowCopy +impl Deref + for PerThreadShallowCopy { type Target = T; @@ -58,22 +52,16 @@ impl< } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > DerefMut for PerThreadShallowCopy +impl DerefMut + for PerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { self.never } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for PerThreadShallowCopy +impl + CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; @@ -124,32 +112,19 @@ impl< inner(param) } } -impl< - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for PerThreadShallowCopy +impl sealed::Sealed + for PerThreadShallowCopy { } -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PerThreadShallowCopy +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; #[cfg(feature = "host")] type SyncHostType = &'a T; @@ -159,19 +134,7 @@ impl< _stream: &'stream rustacuda::stream::Stream, inner: impl 
for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - inner(const_ref.as_async()) + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| inner(const_ref.as_async())) } #[cfg(feature = "host")] @@ -199,27 +162,18 @@ impl< param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - let param = param.as_ref().into_ref(); + let param = param.as_ref(); inner(param) } } -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PerThreadShallowCopy +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + sealed::Sealed for &'a PerThreadShallowCopy { } -impl< - 'a, - T: 'static - + crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > CudaKernelParameter for &'a PtxJit> +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = @@ -272,21 +226,21 @@ impl< ) } } -impl< - 'a, - T: crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for &'a PtxJit> +impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> + sealed::Sealed for &'a PtxJit> { } -pub struct ShallowInteriorMutable { +pub struct ShallowInteriorMutable< + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + 
InteriorMutableSync, +> { never: !, _marker: PhantomData, } -impl Deref for ShallowInteriorMutable { +impl Deref + for ShallowInteriorMutable +{ type Target = T; fn deref(&self) -> &Self::Target { @@ -294,19 +248,19 @@ impl Deref for ShallowInteriorMutable { } } -impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter - for &'a ShallowInteriorMutable +impl< + 'a, + T: 'static + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + InteriorMutableSync, + > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, - 'b, - crate::utils::device_copy::SafeDeviceCopyWrapper, - >; + type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, crate::utils::device_copy::SafeDeviceCopyWrapper>; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; #[cfg(feature = "host")] /// The kernel takes a mutable borrow of the interior mutable data to ensure /// the interior mutability is limited to just this kernel invocation. 
@@ -318,25 +272,9 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter _stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - let host_box = crate::host::HostDeviceBox::from(rustacuda::memory::DeviceBox::new( - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - )?); - - // Safety: `host_box` contains exactly the device copy of `param` - let const_ref = unsafe { - crate::host::HostAndDeviceConstRef::new( - &host_box, - crate::utils::device_copy::SafeDeviceCopyWrapper::from_ref(param), - ) - }; - - let result = inner(const_ref.as_async()); - - host_box.copy_to(crate::utils::device_copy::SafeDeviceCopyWrapper::from_mut( - param, - ))?; - - result + crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { + inner(const_ref.as_ref().as_async()) + }) } #[cfg(feature = "host")] @@ -364,24 +302,23 @@ impl<'a, T: 'static + InteriorMutableSafeDeviceCopy> CudaKernelParameter param: Self::FfiType<'static, 'static>, inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, ) -> O { - let param = param.as_ref().into_ref(); + let param = param.as_ref(); inner(param) } } -impl<'a, T: InteriorMutableSafeDeviceCopy> sealed::Sealed for &'a ShallowInteriorMutable {} - -pub trait InteriorMutableSafeDeviceCopy: - crate::safety::SafeDeviceCopy - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout - + sealed::Sealed +impl< + 'a, + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + > sealed::Sealed for &'a ShallowInteriorMutable { } +pub trait InteriorMutableSync: Sync + sealed::Sealed {} + macro_rules! 
impl_atomic_interior_mutable { ($atomic:ident($interior:ty)) => { - impl InteriorMutableSafeDeviceCopy for core::sync::atomic::$atomic {} + impl InteriorMutableSync for core::sync::atomic::$atomic {} impl sealed::Sealed for core::sync::atomic::$atomic {} }; ($($atomic:ident($interior:ty)),*) => { @@ -395,27 +332,21 @@ impl_atomic_interior_mutable! { AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) } -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > InteriorMutableSafeDeviceCopy for core::cell::SyncUnsafeCell +impl InteriorMutableSync + for core::cell::SyncUnsafeCell { } -impl< - T: crate::safety::StackOnly - + crate::safety::NoSafeAliasing - + const_type_layout::TypeGraphLayout, - > sealed::Sealed for core::cell::SyncUnsafeCell +impl sealed::Sealed + for core::cell::SyncUnsafeCell { } -pub struct SharedHeapPerThreadShallowCopy { +pub struct SharedHeapPerThreadShallowCopy { never: !, _marker: PhantomData, } -impl Deref for SharedHeapPerThreadShallowCopy { +impl Deref for SharedHeapPerThreadShallowCopy { type Target = T; fn deref(&self) -> &Self::Target { @@ -423,17 +354,11 @@ impl Deref for SharedHeapPerThrea } } -impl DerefMut for SharedHeapPerThreadShallowCopy { - fn deref_mut(&mut self) -> &mut Self::Target { - self.never - } -} - impl< T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: 'static + crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -488,16 +413,14 @@ impl< } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for 
SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, @@ -548,16 +471,13 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a SharedHeapPerThreadShallowCopy -{ -} +impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} impl< T: RustToCuda< - CudaRepresentation: 'static + crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: 'static + crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -616,14 +536,14 @@ impl< } impl< T: RustToCuda< - CudaRepresentation: crate::safety::SafeDeviceCopy, - CudaAllocation: EmptyCudaAlloc, - > + crate::safety::NoSafeAliasing, + CudaRepresentation: crate::safety::PortableBitSemantics, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for PtxJit> { } -impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelParameter +impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -681,10 +601,7 @@ impl<'a, T: 'static + RustToCuda + crate::safety::NoSafeAliasing> CudaKernelPara ) } } -impl<'a, T: RustToCuda + crate::safety::NoSafeAliasing> sealed::Sealed - for &'a PtxJit> -{ -} +impl<'a, T: RustToCuda> sealed::Sealed for &'a PtxJit> {} #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -703,34 +620,30 @@ fn emit_param_ptx_jit_marker(param: &T) { } mod private_shared { + use 
core::marker::PhantomData; + use const_type_layout::{TypeGraphLayout, TypeLayout}; - use rustacuda_core::DeviceCopy; + + use crate::safety::PortableBitSemantics; #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _marker: [T; 0], + pub struct ThreadBlockSharedFfi { + pub(super) _dummy: [u8; 0], + pub(super) _marker: PhantomData, } - // Safety: there is nothing to copy, this is just a zero-sized marker type - unsafe impl DeviceCopy for ThreadBlockSharedFfi {} - #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { + pub struct ThreadBlockSharedSliceFfi { pub(super) len: usize, pub(super) _marker: [T; 0], } - - // Safety: we only copy a usize, which implements `DeviceCopy` - unsafe impl DeviceCopy for ThreadBlockSharedSliceFfi {} } -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::ThreadBlockShared -{ +impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; #[cfg(any(feature = "device", doc))] @@ -765,7 +678,10 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter fn async_to_ffi<'stream, 'b>( _param: Self::AsyncHostType<'stream, 'b>, ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { _marker: [] } + private_shared::ThreadBlockSharedFfi { + _dummy: [], + _marker: PhantomData::, + } } #[cfg(feature = "device")] @@ -780,12 +696,9 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter inner(&mut param) } } -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed - for &'a mut crate::utils::shared::ThreadBlockShared -{ -} +impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} -impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter for &'a mut 
crate::utils::shared::ThreadBlockSharedSlice { #[cfg(feature = "host")] @@ -840,7 +753,7 @@ impl<'a, T: 'static + TypeGraphLayout> CudaKernelParameter } } } -impl<'a, T: 'static + TypeGraphLayout> sealed::Sealed +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockSharedSlice { } diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 4acfd7b2c..e2a78999b 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -1,4 +1,4 @@ -use crate::deps::alloc::boxed::Box; +use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -7,7 +7,7 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,17 +24,11 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct BoxCudaRepresentation(*mut T); +pub struct BoxCudaRepresentation(DeviceOwnedPointer); -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl rustacuda_core::DeviceCopy - for BoxCudaRepresentation -{ -} - -unsafe impl RustToCuda for Box { +unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; @@ -52,9 +46,9 @@ unsafe impl RustToCuda for Box { CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); Ok(( - DeviceAccessible::from(BoxCudaRepresentation( + DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( device_box.as_device_ptr().as_raw_mut().cast(), - )), + ))), CombinedCudaAlloc::new(device_box, alloc), )) } @@ -76,11 +70,11 @@ 
unsafe impl RustToCuda for Box { } } -unsafe impl CudaAsRust for BoxCudaRepresentation { +unsafe impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - crate::deps::alloc::boxed::Box::from_raw(this.0) + crate::deps::alloc::boxed::Box::from_raw(this.0 .0) } } diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 6e1c95d90..677fcca7d 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -1,4 +1,6 @@ -use crate::deps::alloc::boxed::Box; +use core::marker::PhantomData; + +use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; use const_type_layout::{TypeGraphLayout, TypeLayout}; @@ -7,7 +9,7 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,17 +24,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct BoxedSliceCudaRepresentation(*mut T, usize); - -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl rustacuda_core::DeviceCopy - for BoxedSliceCudaRepresentation -{ +pub struct BoxedSliceCudaRepresentation { + data: DeviceOwnedPointer, + len: usize, + _marker: PhantomData, } -unsafe impl RustToCuda for Box<[T]> { +unsafe impl RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -53,10 +53,11 @@ unsafe impl RustToCuda for Box<[T]> { )?); Ok(( - DeviceAccessible::from(BoxedSliceCudaRepresentation( - device_buffer.as_mut_ptr().cast(), - device_buffer.len(), - )), + DeviceAccessible::from(BoxedSliceCudaRepresentation { + data: 
DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::, + }), CombinedCudaAlloc::new(device_buffer, alloc), )) } @@ -78,11 +79,16 @@ unsafe impl RustToCuda for Box<[T]> { } } -unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { +unsafe impl CudaAsRust + for BoxedSliceCudaRepresentation +{ type RustRepresentation = Box<[T]>; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut(this.0, this.1)) + crate::deps::alloc::boxed::Box::from_raw(core::slice::from_raw_parts_mut( + this.data.0, + this.len, + )) } } diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 291a4a255..f12f24861 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -7,7 +7,7 @@ use rustacuda::error::CudaResult; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; @@ -23,10 +23,6 @@ pub struct OptionCudaRepresentation { present: bool, } -// Safety: Since the CUDA representation of T is DeviceCopy, -// the full enum is also DeviceCopy -unsafe impl rustacuda_core::DeviceCopy for OptionCudaRepresentation {} - unsafe impl RustToCuda for Option { type CudaAllocation = Option<::CudaAllocation>; type CudaRepresentation = OptionCudaRepresentation<::CudaRepresentation>; @@ -149,7 +145,7 @@ unsafe impl CudaAsRust for OptionCudaRepresentation { } } -impl RustToCudaProxy> +impl RustToCudaProxy> for Option> { fn from_ref(val: &Option) -> &Self { @@ -167,7 +163,7 @@ impl RustToCudaProxy> } } -impl RustToCudaAsyncProxy> +impl RustToCudaAsyncProxy> for Option> { fn from_ref(val: &Option) -> &Self { diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index c6aee84e6..39ba6117d 100644 --- a/src/lend/impls/ref.rs +++ 
b/src/lend/impls/ref.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,20 +25,14 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct RefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *const T, +pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, _marker: PhantomData<&'a T>, } -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for RefCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; @@ -56,7 +51,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { Ok(( DeviceAccessible::from(RefCudaRepresentation { - data: device_box.as_device_ptr().as_raw().cast(), + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), _marker: PhantomData::<&'a T>, }), CombinedCudaAlloc::new(device_box, alloc), @@ -73,11 +68,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a T { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust + for RefCudaRepresentation<'a, T> +{ type RustRepresentation = &'a T; #[cfg(feature = 
"device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - &*this.data + &*this.data.0 } } diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index a4f4dbe29..33d0fa6e7 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBox}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -24,20 +25,14 @@ use crate::{ #[repr(transparent)] #[derive(TypeLayout)] #[allow(clippy::module_name_repetitions)] -pub struct RefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *mut T, +pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, _marker: PhantomData<&'a mut T>, } -// Safety: This repr(C) struct only contains a device-owned pointer -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for RefMutCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; @@ -56,7 +51,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut T { Ok(( DeviceAccessible::from(RefMutCudaRepresentation { - data: device_box.as_device_ptr().as_raw_mut().cast(), + data: DeviceMutPointer(device_box.as_device_ptr().as_raw_mut().cast()), _marker: PhantomData::<&'a mut T>, }), CombinedCudaAlloc::new(device_box, alloc), @@ -80,14 +75,14 @@ unsafe impl<'a, T: SafeDeviceCopy + 
TypeGraphLayout> RustToCuda for &'a mut T { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefMutCudaRepresentation<'a, T> { type RustRepresentation = &'a mut T; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - let data: *mut T = this.data; + let data: *mut T = this.data.0; &mut *data } } diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 6108f9ccd..4b7898571 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceConstPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,21 +23,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct SliceRefCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *const T, +pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceConstPointer, len: usize, _marker: PhantomData<&'a [T]>, } -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for SliceRefCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] type CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -58,7 +53,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { Ok(( DeviceAccessible::from(SliceRefCudaRepresentation { - data: 
device_buffer.as_ptr().cast(), + data: DeviceConstPointer(device_buffer.as_ptr().cast()), len: device_buffer.len(), _marker: PhantomData::<&'a [T]>, }), @@ -76,13 +71,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a [T] { } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefCudaRepresentation<'a, T> { type RustRepresentation = &'a [T]; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - core::slice::from_raw_parts(this.data, this.len) + core::slice::from_raw_parts(this.data.0, this.len) } } diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index b2f79abf9..9246fa474 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -7,7 +7,8 @@ use rustacuda::{error::CudaResult, memory::DeviceBuffer}; use crate::{ lend::{CudaAsRust, RustToCuda}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, + utils::ffi::DeviceMutPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -22,21 +23,15 @@ use crate::{ #[doc(hidden)] #[allow(clippy::module_name_repetitions)] -#[derive(Debug, TypeLayout)] +#[derive(TypeLayout)] #[repr(C)] -pub struct SliceRefMutCudaRepresentation<'a, T: 'a + SafeDeviceCopy + TypeGraphLayout> { - data: *mut T, +pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> { + data: DeviceMutPointer, len: usize, _marker: PhantomData<&'a mut [T]>, } -// Safety: This repr(C) struct only contains a device-owned pointer and a usize -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> rustacuda_core::DeviceCopy - for SliceRefMutCudaRepresentation<'a, T> -{ -} - -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] { +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { #[cfg(all(feature = "host", not(doc)))] type 
CudaAllocation = crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] @@ -58,7 +53,7 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] Ok(( DeviceAccessible::from(SliceRefMutCudaRepresentation { - data: device_buffer.as_mut_ptr().cast(), + data: DeviceMutPointer(device_buffer.as_mut_ptr().cast()), len: device_buffer.len(), _marker: PhantomData::<&'a mut [T]>, }), @@ -83,13 +78,13 @@ unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> RustToCuda for &'a mut [T] } } -unsafe impl<'a, T: SafeDeviceCopy + TypeGraphLayout> CudaAsRust +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefMutCudaRepresentation<'a, T> { type RustRepresentation = &'a mut [T]; #[cfg(feature = "device")] unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - core::slice::from_raw_parts_mut(this.data, this.len) + core::slice::from_raw_parts_mut(this.data.0, this.len) } } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 6f7bab5d7..2fac0a08e 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -1,7 +1,6 @@ use const_type_layout::TypeGraphLayout; #[cfg(feature = "host")] use rustacuda::error::CudaError; -use rustacuda_core::DeviceCopy; #[cfg(feature = "derive")] #[allow(clippy::module_name_repetitions)] @@ -17,10 +16,7 @@ use crate::{ host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; #[cfg(any(feature = "host", feature = "device"))] -use crate::{ - safety::{NoSafeAliasing, SafeDeviceCopy}, - utils::ffi::DeviceAccessible, -}; +use crate::{safety::PortableBitSemantics, utils::ffi::DeviceAccessible}; mod impls; @@ -120,7 +116,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// # Safety /// /// This is an internal trait and should NEVER be implemented manually -pub unsafe trait CudaAsRust: DeviceCopy + TypeGraphLayout { +pub unsafe trait CudaAsRust: PortableBitSemantics + TypeGraphLayout { type RustRepresentation: RustToCuda; #[doc(hidden)] @@ -147,7 +143,7 @@ pub trait 
RustToCudaAsyncProxy: RustToCudaAsync { #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -pub trait LendToCuda: RustToCuda + NoSafeAliasing { +pub trait LendToCuda: RustToCuda { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure @@ -167,7 +163,7 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` is [`SafeDeviceCopy`] + /// Moves `self` to CUDA iff `self` has [`PortableBitSemantics`] /// /// # Errors /// @@ -183,11 +179,11 @@ pub trait LendToCuda: RustToCuda + NoSafeAliasing { inner: F, ) -> Result where - Self: RustToCuda; + Self: RustToCuda; } #[cfg(feature = "host")] -impl LendToCuda for T { +impl LendToCuda for T { fn lend_to_cuda< O, E: From, @@ -219,7 +215,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -232,7 +228,7 @@ impl LendToCuda for T { } #[cfg(feature = "device")] -pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { +pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the @@ -254,11 +250,11 @@ pub trait BorrowFromRust: RustToCuda + NoSafeAliasing { ) -> O where Self: Sized, - ::CudaRepresentation: SafeDeviceCopy; + ::CudaRepresentation: PortableBitSemantics; } #[cfg(feature = "device")] -impl BorrowFromRust for T { +impl BorrowFromRust for T { #[inline] unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, @@ -278,7 +274,7 @@ impl BorrowFromRust for T { ) -> O where Self: Sized, - ::CudaRepresentation: SafeDeviceCopy, + ::CudaRepresentation: PortableBitSemantics, { inner(CudaAsRust::as_rust(cuda_repr.as_mut())) } diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs deleted file mode 100644 index a2bfc9552..000000000 --- 
a/src/safety/device_copy.rs +++ /dev/null @@ -1,29 +0,0 @@ -use const_type_layout::TypeGraphLayout; - -use crate::{safety::StackOnly, utils::ffi::DeviceAccessible}; - -#[allow(clippy::module_name_repetitions)] -/// Types which are safe to memcpy from the CPU to a GPU. -/// -/// For a type to implement [`SafeDeviceCopy`], it must -/// -/// * have the same memory layout on both the CPU and GPU -/// -/// * not contain any references to data that is inaccessible from the GPU -/// -/// Types that implement both [`TypeGraphLayout`] and [`StackOnly`] satisfy -/// both of these criteria and thus implement [`SafeDeviceCopy`]. -#[marker] -pub trait SafeDeviceCopy: sealed::Sealed {} - -impl SafeDeviceCopy for T {} -impl sealed::Sealed for T {} - -#[doc(hidden)] -impl SafeDeviceCopy for DeviceAccessible {} -impl sealed::Sealed for DeviceAccessible {} - -mod sealed { - #[marker] - pub trait Sealed {} -} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 72ed9c7db..243a2a9f9 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,6 +1,5 @@ mod arch; -mod device_copy; -mod no_aliasing; +mod portable; mod stack_only; #[doc(hidden)] @@ -8,6 +7,5 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; -pub use device_copy::SafeDeviceCopy; -pub use no_aliasing::NoSafeAliasing; +pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs deleted file mode 100644 index 7baa06f19..000000000 --- a/src/safety/no_aliasing.rs +++ /dev/null @@ -1,86 +0,0 @@ -#[allow(clippy::module_name_repetitions)] -/// Types which can be safely shared between CUDA threads because they do -/// not provide safe aliasing mutable access to some shared inner state. -/// -/// This trait is automatically implemented when the compiler determines -/// it's appropriate. -/// -/// Data types that contain no references and can thus live entirely on -/// the stack, e.g. 
primitive types like [`u8`] and structs, tuples, and -/// enums made only from them, or more generally those types that implement -/// [`StackOnly`](super::StackOnly), also implement [`NoSafeAliasing`] as they -/// do not contain any inner data that might be shared when each thread is -/// given mutable access to a copy. -/// -/// In contrast, `&mut T` (and any type containing a mutable reference) do *not* -/// implement [`NoSafeAliasing`] as several threads would obtain mutable -/// aliasing access to the same date, thus violating Rust's borrowing and -/// memory safety rules. -/// -/// Even though `*const T` and `*mut T` do not provide *safe* mutable aliasing -/// access to their underlying data, as dereferincing them is always unsafe, -/// they (and any type containing a pointer) do *not* implement -/// [`NoSafeAliasing`] to ensure that any data type that uses them to build a -/// safe interface to accessing data, e.g. [`Box`], does not accidentially -/// implement [`NoSafeAliasing`]. If you have implemented a data structure that -/// uses `*const T` or `*mut T` internally but also ensures that no safe -/// aliasing mutable access is provided, you can *unsafely* implement -/// [`NoSafeAliasing`] for your type. Please reference the [Safety](#safety) -/// section below for more details on the contract you must uphold in this case. -/// -/// # Safety -/// -/// This trait must only be manually implemented for a type that upholds -/// the no-mutable-aliasing guarantee through its safe API. -/// -/// The following examples outline three different cases for types that do -/// fulfil this safety requirement: -/// -/// * [`Final`](final::Final) implements [`NoSafeAliasing`] -/// because even a mutable reference to it only provides read-only access -/// to its inner data. 
-/// -/// * [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) -/// and -/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) -/// also implement [`NoSafeAliasing`] because they only provide each CUDA thread -/// with mutable access to its own partition of a slice and thus avoid mutable -/// aliasing. -/// -/// * [`ThreadBlockShared`](crate::utils::shared::ThreadBlockShared) -/// and -/// [`ThreadBlockSharedSlice`](crate::utils::shared::ThreadBlockSharedSlice) -/// also implement [`NoSafeAliasing`] since they only provide access to `*mut -/// T`, which is always unsafe to mutate and thus moves the burden to uphoald -/// the no-mutable-aliasing safety invariant to the user who derefereces these -/// pointers. -pub unsafe auto trait NoSafeAliasing {} - -impl !NoSafeAliasing for &mut T {} -impl !NoSafeAliasing for *const T {} -impl !NoSafeAliasing for *mut T {} - -unsafe impl NoSafeAliasing for core::marker::PhantomData {} - -unsafe impl NoSafeAliasing for r#final::Final {} -unsafe impl NoSafeAliasing - for crate::utils::aliasing::FinalCudaRepresentation -{ -} - -unsafe impl NoSafeAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride -{ -} -unsafe impl NoSafeAliasing - for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride -{ -} - -// Thread-block-shared data only allows unsafe aliasing since only raw pointers -// are exposed -unsafe impl NoSafeAliasing for crate::utils::shared::ThreadBlockShared {} -unsafe impl NoSafeAliasing - for crate::utils::shared::ThreadBlockSharedSlice -{ -} diff --git a/src/safety/portable.rs b/src/safety/portable.rs new file mode 100644 index 000000000..5b438e2f7 --- /dev/null +++ b/src/safety/portable.rs @@ -0,0 +1,63 @@ +macro_rules! 
portable_bit_semantics_docs { + ($item:item) => { + /// Types whose in-memory bit representation on the CPU host is safe to copy + /// to and read back on the GPU device while maintaining the same semantics, + /// iff the type layout on the CPU matches the type layout on the GPU. + /// + /// For a type to implement [`PortableBitSemantics`], it + /// + /// * should have the same memory layout on both the CPU and GPU, and + /// + /// * must not contain any references to data that are exposed as safely + /// accessible on both ends but actually inaccessible on one. + /// + /// For instance, a reference `&u8` to host memory has the same well-defined + /// layout on both CPU and GPU (if their pointer sizes and alignments + /// match), but it is not portable since the host memory is generally + /// not accessible from the GPU. + /// + /// This trait is automatically implemented when the compiler determines + /// it's appropriate. + /// + /// Note that this trait is *sealed*, i.e. you cannot implement it on your + /// own custom types. + /// + /// Trait bounds usually combine [`PortableBitSemantics`] with + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) to check that + /// the type layout is indeed the same on both the host CPU and the GPU + /// device. + /// + /// Types that implement [`StackOnly`](crate::safety::StackOnly) and + /// [`TypeGraphLayout`](const_type_layout::TypeGraphLayout) satisfy both + /// of the above criteria and thus also implement [`PortableBitSemantics`]. + $item + }; +} + +#[cfg(not(doc))] +portable_bit_semantics_docs! { + #[allow(clippy::module_name_repetitions)] + pub trait PortableBitSemantics: sealed::PortableBitSemantics {} +} +#[cfg(doc)] +portable_bit_semantics_docs! 
{ + pub use sealed::PortableBitSemantics; +} + +#[cfg(not(doc))] +impl PortableBitSemantics for T {} + +mod sealed { + pub auto trait PortableBitSemantics {} + + impl !PortableBitSemantics for &T {} + impl !PortableBitSemantics for &mut T {} + impl !PortableBitSemantics for *const T {} + impl !PortableBitSemantics for *mut T {} + + impl PortableBitSemantics for core::marker::PhantomData {} + + impl PortableBitSemantics for crate::utils::ffi::DeviceConstPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceMutPointer {} + impl PortableBitSemantics for crate::utils::ffi::DeviceOwnedPointer {} +} diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index bfb4e80d0..eac7f9456 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -64,23 +64,23 @@ macro_rules! stack_only_docs { #[cfg(not(doc))] stack_only_docs! { #[allow(clippy::module_name_repetitions)] - pub trait StackOnly: sealed::Sealed {} + pub trait StackOnly: sealed::StackOnly {} } #[cfg(doc)] stack_only_docs! 
{ - pub use sealed::Sealed as StackOnly; + pub use sealed::StackOnly; } #[cfg(not(doc))] -impl StackOnly for T {} +impl StackOnly for T {} mod sealed { - pub auto trait Sealed {} + pub auto trait StackOnly {} - impl !Sealed for &T {} - impl !Sealed for &mut T {} - impl !Sealed for *const T {} - impl !Sealed for *mut T {} + impl !StackOnly for &T {} + impl !StackOnly for &mut T {} + impl !StackOnly for *const T {} + impl !StackOnly for *mut T {} - impl Sealed for core::marker::PhantomData {} + impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index b3a28cf25..c36f814bf 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -6,7 +6,6 @@ use core::{ }; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, @@ -25,13 +24,6 @@ impl SplitSliceOverCudaThreadsConstStride { } } -// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is -// [`DeviceCopy`] -unsafe impl DeviceCopy - for SplitSliceOverCudaThreadsConstStride -{ -} - #[cfg(feature = "device")] fn split_slice_const_stride(slice: &[E]) -> &[E] { let offset: usize = crate::device::thread::Thread::this().index() * STRIDE; diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 50f028ec3..0ab97016c 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -6,7 +6,6 @@ use core::{ }; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, @@ -28,10 +27,6 @@ impl SplitSliceOverCudaThreadsDynamicStride { } } -// Safety: If [`T`] is [`DeviceCopy`], then the newtype struct also is -// [`DeviceCopy`] -unsafe impl DeviceCopy for SplitSliceOverCudaThreadsDynamicStride {} - #[cfg(feature = "device")] fn split_slice_dynamic_stride(slice: &[E], stride: usize) -> &[E] { let offset: usize = 
crate::device::thread::Thread::this().index() * stride; diff --git a/src/utils/aliasing/final.rs b/src/utils/aliasing/final.rs deleted file mode 100644 index 432910920..000000000 --- a/src/utils/aliasing/final.rs +++ /dev/null @@ -1,90 +0,0 @@ -use const_type_layout::TypeLayout; -use r#final::Final; - -use crate::{ - lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - utils::ffi::DeviceAccessible, -}; - -#[doc(hidden)] -#[repr(transparent)] -#[derive(TypeLayout)] -#[allow(clippy::module_name_repetitions)] -pub struct FinalCudaRepresentation(DeviceAccessible); - -// Safety: If [`T`] is [`CudaAsRust`], then the newtype struct is [`DeviceCopy`] -unsafe impl rustacuda_core::DeviceCopy for FinalCudaRepresentation {} - -unsafe impl RustToCuda for Final { - type CudaAllocation = T::CudaAllocation; - type CudaRepresentation = FinalCudaRepresentation; - - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::alloc::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow(alloc)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - unsafe fn restore( - &mut self, - alloc: crate::alloc::CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore(alloc) - } -} - -unsafe impl RustToCudaAsync for Final { - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - crate::alloc::CombinedCudaAlloc, - )> { - let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; - - Ok(( - DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), - alloc, - )) - } - - #[cfg(feature = "host")] - unsafe fn 
restore_async( - &mut self, - alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - // Safety: Final is a repr(transparent) newtype wrapper around T - let inner: &mut T = &mut *(self as *mut Self).cast(); - - inner.restore_async(alloc, stream) - } -} - -unsafe impl CudaAsRust for FinalCudaRepresentation { - type RustRepresentation = Final; - - #[cfg(feature = "device")] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - Final::new(CudaAsRust::as_rust(&this.0)) - } -} diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index de7c58e05..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,8 +1,5 @@ mod r#const; mod dynamic; -mod r#final; pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; pub use r#const::SplitSliceOverCudaThreadsConstStride; - -pub(crate) use self::r#final::FinalCudaRepresentation; diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs index 2363b4855..72bd7d64e 100644 --- a/src/utils/device_copy.rs +++ b/src/utils/device_copy.rs @@ -5,7 +5,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ alloc::NoCudaAlloc, lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - safety::SafeDeviceCopy, + safety::PortableBitSemantics, }; #[cfg(any(feature = "host", feature = "device"))] @@ -16,22 +16,17 @@ use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T) -where - T: SafeDeviceCopy + TypeGraphLayout; +pub struct SafeDeviceCopyWrapper(T); -unsafe impl rustacuda_core::DeviceCopy - for SafeDeviceCopyWrapper -{ -} +unsafe impl rustacuda_core::DeviceCopy for SafeDeviceCopyWrapper {} -impl From for SafeDeviceCopyWrapper { +impl From for SafeDeviceCopyWrapper { fn from(value: T) -> Self { Self(value) } } -impl SafeDeviceCopyWrapper { +impl SafeDeviceCopyWrapper { #[must_use] pub fn into_inner(self) -> T { 
self.0 @@ -86,7 +81,7 @@ impl SafeDeviceCopyWrapper { } } -unsafe impl RustToCuda for SafeDeviceCopyWrapper { +unsafe impl RustToCuda for SafeDeviceCopyWrapper { type CudaAllocation = NoCudaAlloc; type CudaRepresentation = Self; @@ -114,7 +109,9 @@ unsafe impl RustToCuda for SafeDeviceCopyWr } } -unsafe impl RustToCudaAsync for SafeDeviceCopyWrapper { +unsafe impl RustToCudaAsync + for SafeDeviceCopyWrapper +{ #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -141,7 +138,7 @@ unsafe impl RustToCudaAsync for SafeDeviceC } } -unsafe impl CudaAsRust for SafeDeviceCopyWrapper { +unsafe impl CudaAsRust for SafeDeviceCopyWrapper { type RustRepresentation = Self; #[cfg(feature = "device")] diff --git a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index 450ed0975..cfacf61a2 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,7 +1,6 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; -use rustacuda_core::DeviceCopy; -use crate::{lend::CudaAsRust, safety::SafeDeviceCopy}; +use crate::{lend::CudaAsRust, safety::PortableBitSemantics, utils::ffi::DeviceMutPointer}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -9,21 +8,16 @@ use super::{CudaExchangeBuffer, CudaExchangeItem}; #[doc(hidden)] #[derive(TypeLayout)] #[repr(C)] -pub struct CudaExchangeBufferCudaRepresentation( - pub(super) *mut CudaExchangeItem, +pub struct CudaExchangeBufferCudaRepresentation< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>( + pub(super) DeviceMutPointer>, pub(super) usize, -) -where - T: SafeDeviceCopy + TypeGraphLayout; +); -// Safety: [`CudaExchangeBufferCudaRepresentation`] is [`DeviceCopy`] -// iff [`T`] is [`SafeDeviceCopy`] -unsafe impl DeviceCopy - for CudaExchangeBufferCudaRepresentation -{ -} - -unsafe impl CudaAsRust +unsafe impl CudaAsRust for CudaExchangeBufferCudaRepresentation { type RustRepresentation = CudaExchangeBuffer; @@ 
-35,7 +29,7 @@ unsafe impl(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -23,7 +23,7 @@ impl Dere } } -impl DerefMut +impl DerefMut for CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 58e200881..f7fedc804 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,23 +12,29 @@ use rustacuda::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, - safety::SafeDeviceCopy, - utils::ffi::DeviceAccessible, + safety::PortableBitSemantics, + utils::{ + device_copy::SafeDeviceCopyWrapper, + ffi::{DeviceAccessible, DeviceMutPointer}, + }, }; use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferHost< - T: SafeDeviceCopy + TypeGraphLayout, + T: PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { - host_buffer: CudaDropWrapper>>, - device_buffer: UnsafeCell>>>, + host_buffer: + CudaDropWrapper>>>, + device_buffer: UnsafeCell< + CudaDropWrapper>>>, + >, } -impl +impl CudaExchangeBufferHost { /// # Errors @@ -38,7 +44,10 @@ impl = unsafe { &*(elem as *const T).cast() }; - let host_buffer = CudaDropWrapper::from(LockedBuffer::new(elem, capacity)?); + let host_buffer = CudaDropWrapper::from(LockedBuffer::new( + SafeDeviceCopyWrapper::from_ref(elem), + capacity, + )?); let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -50,21 +59,26 @@ impl +impl CudaExchangeBufferHost { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn from_vec(vec: Vec) -> CudaResult { - let mut host_buffer_uninit = - CudaDropWrapper::from(unsafe { LockedBuffer::uninitialized(vec.len())? 
}); + let host_buffer = unsafe { + let mut uninit: CudaDropWrapper>> = + CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); - for (src, dst) in vec.into_iter().zip(host_buffer_uninit.iter_mut()) { - *dst = CudaExchangeItem(src); - } + for (i, src) in vec.into_iter().enumerate() { + uninit + .as_mut_ptr() + .add(i) + .write(SafeDeviceCopyWrapper::from(CudaExchangeItem(src))); + } - let host_buffer = host_buffer_uninit; + uninit + }; let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), @@ -77,25 +91,25 @@ impl } } -impl Deref +impl Deref for CudaExchangeBufferHost { type Target = [CudaExchangeItem]; fn deref(&self) -> &Self::Target { - self.host_buffer.as_slice() + SafeDeviceCopyWrapper::into_slice(self.host_buffer.as_slice()) } } -impl DerefMut +impl DerefMut for CudaExchangeBufferHost { fn deref_mut(&mut self) -> &mut Self::Target { - self.host_buffer.as_mut_slice() + SafeDeviceCopyWrapper::into_mut_slice(self.host_buffer.as_mut_slice()) } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -121,7 +135,7 @@ impl Ok(( DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( - device_buffer.as_mut_ptr(), + DeviceMutPointer(device_buffer.as_mut_ptr().cast()), device_buffer.len(), )), CombinedCudaAlloc::new(NoCudaAlloc, alloc), @@ -148,7 +162,7 @@ impl } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -176,7 +190,7 @@ impl Ok(( DeviceAccessible::from(CudaExchangeBufferCudaRepresentation( - device_buffer.as_mut_ptr(), + DeviceMutPointer(device_buffer.as_mut_ptr().cast()), device_buffer.len(), )), CombinedCudaAlloc::new(NoCudaAlloc, alloc), diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c1dea16d0..f493f316c 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -9,7 +9,7 @@ use const_type_layout::TypeLayout; #[cfg(any(feature = "host", feature = "device"))] use 
const_type_layout::TypeGraphLayout; -use crate::safety::SafeDeviceCopy; +use crate::safety::PortableBitSemantics; #[cfg(any(feature = "host", feature = "device"))] use crate::{ @@ -35,8 +35,11 @@ mod host; #[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] -pub struct CudaExchangeBuffer -{ +pub struct CudaExchangeBuffer< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +> { #[cfg(feature = "host")] inner: host::CudaExchangeBufferHost, #[cfg(all(feature = "device", not(feature = "host")))] @@ -44,7 +47,7 @@ pub struct CudaExchangeBuffer +impl CudaExchangeBuffer { /// # Errors @@ -58,7 +61,7 @@ impl +impl CudaExchangeBuffer { /// # Errors @@ -72,7 +75,7 @@ impl } #[cfg(any(feature = "host", feature = "device"))] -impl Deref +impl Deref for CudaExchangeBuffer { type Target = [CudaExchangeItem]; @@ -83,7 +86,7 @@ impl Dere } #[cfg(any(feature = "host", feature = "device"))] -impl DerefMut +impl DerefMut for CudaExchangeBuffer { fn deref_mut(&mut self) -> &mut Self::Target { @@ -92,7 +95,7 @@ impl Dere } #[cfg(any(feature = "host", feature = "device"))] -unsafe impl RustToCuda +unsafe impl RustToCuda for CudaExchangeBuffer { type CudaAllocation = NoCudaAlloc; @@ -121,8 +124,8 @@ unsafe impl RustToCudaAsync - for CudaExchangeBuffer +unsafe impl + RustToCudaAsync for CudaExchangeBuffer { #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -150,16 +153,13 @@ unsafe impl(T); - -// Safety: Transparent newtype wrapper around [`SafeDeviceCopy`] -// is [`DeviceCopy`] -unsafe impl rustacuda_core::DeviceCopy - for CudaExchangeItem -{ -} +pub struct CudaExchangeItem< + T: PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, +>(T); -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 @@ -171,7 +171,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub 
const fn read(&self) -> &T { &self.0 @@ -183,13 +183,13 @@ impl CudaExchangeItem { } } -impl AsMut for CudaExchangeItem { +impl AsMut for CudaExchangeItem { fn as_mut(&mut self) -> &mut T { &mut self.0 } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -201,7 +201,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -213,7 +213,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: @@ -231,7 +231,7 @@ impl CudaExchangeItem { } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 2e9decc51..9eedb058e 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -9,7 +9,7 @@ use std::{ use rustacuda::{ error::{CudaError, CudaResult}, event::{Event, EventFlags, EventStatus}, - memory::DeviceBox, + memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, stream::{Stream, StreamWaitEventFlags}, }; @@ -17,25 +17,33 @@ use crate::{ alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, - HostAndDeviceMutRefAsync, HostDeviceBox, HostLockedBox, + HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::ffi::DeviceAccessible, + utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + 
locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, move_event: CudaDropWrapper, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, waker: Arc>>, @@ -44,8 +52,12 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -53,8 +65,12 @@ pub struct ExchangeWrapperOnDevice #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { value: T, - device_box: HostDeviceBox::CudaRepresentation>>, - locked_cuda_repr: HostLockedBox::CudaRepresentation>>, + device_box: CudaDropWrapper< + DeviceBox::CudaRepresentation>>>, + >, + locked_cuda_repr: CudaDropWrapper< + LockedBox::CudaRepresentation>>>, + >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, @@ -67,12 +83,20 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn new(value: T) -> CudaResult { // Safety: The uninitialised memory is never exposed - // To access the device memory, [`Self::move_to_device`] has to be - // called first, which initialised the memory. 
- let device_box = unsafe { DeviceBox::uninitialized() }?.into(); + // To access the device memory, [`Self::move_to_device`] has to + // be called first, which initialised the memory. + let device_box = CudaDropWrapper::from(unsafe { DeviceBox::uninitialized() }?); let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; - let locked_cuda_repr = HostLockedBox::new(cuda_repr)?; + let locked_cuda_repr = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + SafeDeviceCopyWrapper::CudaRepresentation>>, + >::uninitialized()?); + uninit + .as_mut_ptr() + .write(SafeDeviceCopyWrapper::from(cuda_repr)); + uninit + }; let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); @@ -98,9 +122,9 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; - *self.locked_cuda_repr = cuda_repr; + **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); - self.device_box.copy_from(&self.locked_cuda_repr)?; + self.device_box.copy_from(&**self.locked_cuda_repr)?; Ok(ExchangeWrapperOnDevice { value: self.value, @@ -128,14 +152,14 @@ impl> ExchangeWrapperOnHost CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; - *self.locked_cuda_repr = cuda_repr; + **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); // Safety: The device value is not safely exposed until either // - the passed-in [`Stream`] is synchronised // - the kernel is launched on the passed-in [`Stream`] unsafe { self.device_box - .async_copy_from(&self.locked_cuda_repr, stream) + .async_copy_from(&*self.locked_cuda_repr, stream) }?; self.move_event.record(stream)?; @@ -316,7 +340,11 @@ impl<'stream, T: RustToCuda> ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRefAsync::new(&self.device_box, 
&self.locked_cuda_repr, self.stream) + HostAndDeviceConstRefAsync::new( + &*self.device_box, + (**self.locked_cuda_repr).into_ref(), + self.stream, + ) } } @@ -327,7 +355,7 @@ impl<'stream, T: RustToCuda> unsafe { HostAndDeviceMutRefAsync::new( &mut self.device_box, - &mut self.locked_cuda_repr, + (**self.locked_cuda_repr).into_mut(), self.stream, ) } @@ -470,14 +498,18 @@ impl> ExchangeWrapperOnDevice { &self, ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { HostAndDeviceConstRef::new(&self.device_box, &self.locked_cuda_repr) } + unsafe { + HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + } } pub fn as_mut( &mut self, ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { HostAndDeviceMutRef::new(&mut self.device_box, &mut self.locked_cuda_repr) } + unsafe { + HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) + } } } diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs index 98fd945e7..3b205ffca 100644 --- a/src/utils/ffi.rs +++ b/src/utils/ffi.rs @@ -7,20 +7,16 @@ use core::{ #[cfg(feature = "host")] use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; -#[cfg(feature = "host")] -use const_type_layout::TypeGraphLayout; use const_type_layout::TypeLayout; -use rustacuda_core::DeviceCopy; +use crate::safety::PortableBitSemantics; #[cfg(feature = "host")] -use crate::{lend::CudaAsRust, safety::SafeDeviceCopy, utils::device_copy::SafeDeviceCopyWrapper}; +use crate::{lend::CudaAsRust, utils::device_copy::SafeDeviceCopyWrapper}; -#[repr(transparent)] #[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] -pub struct DeviceAccessible(T); - -unsafe impl DeviceCopy for DeviceAccessible {} +#[repr(transparent)] +pub struct DeviceAccessible(T); #[cfg(feature = "host")] impl From for DeviceAccessible { @@ 
-29,8 +25,9 @@ impl From for DeviceAccessible { } } +// TODO: should there be some copy bound here? #[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { +impl From<&T> for DeviceAccessible> { fn from(value: &T) -> Self { let value = unsafe { let mut uninit = MaybeUninit::uninit(); @@ -43,7 +40,7 @@ impl From<&T> for DeviceAccessible fmt::Debug for DeviceAccessible { +impl fmt::Debug for DeviceAccessible { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) .finish_non_exhaustive() @@ -51,7 +48,7 @@ impl fmt::Debug for DeviceAccessible { } #[cfg(feature = "device")] -impl Deref for DeviceAccessible { +impl Deref for DeviceAccessible { type Target = T; fn deref(&self) -> &Self::Target { @@ -60,74 +57,155 @@ impl Deref for DeviceAccessible { } #[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { +impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } +#[derive(TypeLayout)] #[repr(transparent)] -#[derive(Clone, Copy, TypeLayout)] -pub struct DeviceConstRef<'r, T: DeviceCopy + 'r> { +pub struct DeviceConstRef<'r, T: PortableBitSemantics + 'r> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *const T, + pub(crate) pointer: DeviceConstPointer, pub(crate) reference: PhantomData<&'r T>, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceConstRef<'r, T> {} +impl<'r, T: PortableBitSemantics> Copy for DeviceConstRef<'r, T> {} + +impl<'r, T: PortableBitSemantics> Clone for DeviceConstRef<'r, T> { + fn clone(&self) -> Self { + *self + } +} #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceConstRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceConstRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } -#[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceMutRef<'r, T: DeviceCopy + 'r> { +#[repr(transparent)] +pub struct DeviceMutRef<'r, T: 
PortableBitSemantics + 'r> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *mut T, + pub(crate) pointer: DeviceMutPointer, pub(crate) reference: PhantomData<&'r mut T>, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceMutRef<'r, T> {} - #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceMutRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceMutRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceMutRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsMut for DeviceMutRef<'r, T> { fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } + unsafe { &mut *self.pointer.0 } } } -#[repr(transparent)] #[derive(TypeLayout)] -pub struct DeviceOwnedRef<'r, T: DeviceCopy> { +#[repr(transparent)] +pub struct DeviceOwnedRef<'r, T: PortableBitSemantics> { #[cfg_attr(feature = "host", allow(dead_code))] - pub(crate) pointer: *mut T, + pub(crate) pointer: DeviceOwnedPointer, pub(crate) reference: PhantomData<&'r mut ()>, pub(crate) marker: PhantomData, } -unsafe impl<'r, T: DeviceCopy> DeviceCopy for DeviceOwnedRef<'r, T> {} - #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsRef for DeviceOwnedRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsRef for DeviceOwnedRef<'r, T> { fn as_ref(&self) -> &T { - unsafe { &*self.pointer } + unsafe { &*self.pointer.0 } } } #[cfg(feature = "device")] -impl<'r, T: DeviceCopy> AsMut for DeviceOwnedRef<'r, T> { +impl<'r, T: PortableBitSemantics> AsMut for DeviceOwnedRef<'r, T> { fn as_mut(&mut self) -> &mut T { - unsafe { &mut *self.pointer } + unsafe { &mut *self.pointer.0 } + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceConstPointer(pub(crate) *const T); + +impl Copy for DeviceConstPointer {} + +impl Clone for DeviceConstPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceConstPointer<[T]> { + #[must_use] + pub fn 
into_raw_parts(self) -> (DeviceConstPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceConstPointer(data.cast()), len) + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceMutPointer(pub(crate) *mut T); + +impl Copy for DeviceMutPointer {} + +impl Clone for DeviceMutPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceMutPointer { + #[must_use] + pub const fn as_const(self) -> DeviceConstPointer { + DeviceConstPointer(self.0.cast_const()) + } +} + +impl DeviceMutPointer<[T]> { + #[must_use] + pub fn into_raw_parts(self) -> (DeviceMutPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceMutPointer(data.cast()), len) + } +} + +#[derive(TypeLayout)] +#[repr(transparent)] +pub struct DeviceOwnedPointer(pub(crate) *mut T); + +impl Copy for DeviceOwnedPointer {} + +impl Clone for DeviceOwnedPointer { + fn clone(&self) -> Self { + *self + } +} + +impl DeviceOwnedPointer { + #[must_use] + pub const fn as_const(self) -> DeviceConstPointer { + DeviceConstPointer(self.0.cast_const()) + } + + #[must_use] + pub const fn as_mut(self) -> DeviceMutPointer { + DeviceMutPointer(self.0) + } +} + +impl DeviceOwnedPointer<[T]> { + #[must_use] + pub fn into_raw_parts(self) -> (DeviceOwnedPointer, usize) { + let (data, len) = self.0.to_raw_parts(); + (DeviceOwnedPointer(data.cast()), len) } } From d88bac07ba57af93058524d6fd9636b8962ee7b1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 27 Dec 2023 10:24:35 +0000 Subject: [PATCH 071/120] More refactoring and auditing kernel param bounds --- examples/single-source/src/main.rs | 9 +- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 2 +- src/host/mod.rs | 83 +++--- src/kernel/mod.rs | 4 +- src/kernel/param.rs | 151 +++++++---- src/lend/impls/box.rs | 11 +- src/lend/impls/boxed_slice.rs | 9 +- src/lend/impls/option.rs | 28 +- src/lend/impls/ref.rs | 9 +- src/lend/impls/ref_mut.rs | 11 +- src/lend/impls/slice_ref.rs | 7 +- src/lend/impls/slice_ref_mut.rs | 9 +- 
src/lend/mod.rs | 23 +- src/utils/adapter.rs | 248 ++++++++++++++++++ src/utils/device_copy.rs | 150 ----------- src/utils/exchange/buffer/common.rs | 12 +- src/utils/exchange/buffer/device.rs | 13 +- src/utils/exchange/buffer/host.rs | 48 ++-- src/utils/exchange/buffer/mod.rs | 56 ++-- src/utils/exchange/wrapper.rs | 60 ++++- src/utils/ffi.rs | 29 +- src/utils/mod.rs | 2 +- 22 files changed, 605 insertions(+), 369 deletions(-) create mode 100644 src/utils/adapter.rs delete mode 100644 src/utils/device_copy.rs diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 13f2b7efe..4783deffa 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -23,7 +23,7 @@ fn main() {} #[layout(crate = "rc::deps::const_type_layout")] pub struct Dummy(i32); -#[derive(rc::lend::LendRustToCuda)] +#[derive(Clone, rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { @@ -31,7 +31,7 @@ pub struct Wrapper { inner: T, } -#[derive(rc::lend::LendRustToCuda)] +#[derive(Clone, rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] pub struct Empty([u8; 0]); @@ -54,6 +54,9 @@ pub struct Triple(i32, i32, i32); pub fn kernel< 'a, T: 'static + + Send + + Sync + + Clone + rc::lend::RustToCuda< CudaRepresentation: rc::safety::StackOnly, CudaAllocation: rc::alloc::EmptyCudaAlloc, @@ -96,7 +99,7 @@ mod host { // Link several instances of the generic CUDA kernel struct KernelPtx<'a, T>(std::marker::PhantomData<&'a T>); crate::link! { impl kernel<'a, crate::Empty> for KernelPtx } - crate::link! { impl kernel<'a, rc::utils::device_copy::SafeDeviceCopyWrapper> for KernelPtx } + crate::link! 
{ impl kernel<'a, rc::utils::adapter::RustToCudaWithPortableBitCopySemantics> for KernelPtx } } #[cfg(target_os = "cuda")] diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 313daf86b..36924aaf9 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -109,7 +109,7 @@ pub fn swap_field_type_and_filter_attrs( } else { field_ty = parse_quote! { #crate_path::utils::ffi::DeviceAccessible< - #crate_path::utils::device_copy::SafeDeviceCopyWrapper<#field_ty> + #crate_path::utils::adapter::RustToCudaWithPortableBitCopySemantics<#field_ty> > }; diff --git a/src/host/mod.rs b/src/host/mod.rs index e480de9f2..f77c75792 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -4,6 +4,7 @@ use std::{ ops::{Deref, DerefMut}, }; +use const_type_layout::TypeGraphLayout; use rustacuda::{ context::Context, error::CudaError, @@ -16,7 +17,7 @@ use rustacuda::{ use crate::{ safety::PortableBitSemantics, utils::{ - device_copy::SafeDeviceCopyWrapper, + adapter::DeviceCopyWithPortableBitSemantics, ffi::{ DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, DeviceOwnedRef, @@ -101,17 +102,17 @@ impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, } -impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut DeviceBox>, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, ) -> Self { Self { @@ -132,8 +133,9 @@ impl<'a, T: PortableBitSemantics> 
HostAndDeviceMutRef<'a, T> { host_ref: &mut T, inner: F, ) -> Result { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceMutRef { @@ -142,7 +144,7 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { }); // Copy back any changes made - device_box.copy_to(SafeDeviceCopyWrapper::from_mut(host_ref))?; + device_box.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(host_ref))?; core::mem::drop(device_box); @@ -201,25 +203,25 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceMutRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics> { - device_box: &'a DeviceBox>, +pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, host_ref: &'a T, } -impl<'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'a, T> { fn clone(&self) -> Self { *self } } -impl<'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRef<'a, T> {} +impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} -impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` pub const unsafe fn new( - device_box: &'a DeviceBox>, + device_box: &'a DeviceBox>, host_ref: &'a T, ) -> Self { Self { @@ -240,8 +242,9 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { host_ref: &T, inner: F, ) -> Result { - let device_box = - 
CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(host_ref))?); + let device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(host_ref), + )?); // Safety: `device_box` contains exactly the device copy of `host_ref` let result = inner(HostAndDeviceConstRef { @@ -294,12 +297,12 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceConstRef<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceOwned<'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, } -impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { +impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -308,8 +311,9 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { mut value: T, inner: F, ) -> Result { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&value))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&value), + )?); // Safety: `device_box` contains exactly the device copy of `value` inner(HostAndDeviceOwned { @@ -343,18 +347,20 @@ impl<'a, T: PortableBitSemantics> HostAndDeviceOwned<'a, T> { } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceMutRefAsync<'stream, 'a, T> +{ /// # Safety /// /// 
`device_box` must contain EXACTLY the device copy of `host_ref` pub unsafe fn new( - device_box: &'a mut DeviceBox>, + device_box: &'a mut DeviceBox>, host_ref: &'a mut T, stream: &'stream Stream, ) -> Self { @@ -413,27 +419,34 @@ impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceMutRefAsync<'stream, 'a, } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a DeviceBox>, +pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a DeviceBox>, host_ref: &'a T, stream: PhantomData<&'stream Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> Clone for HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Clone + for HostAndDeviceConstRefAsync<'stream, 'a, T> +{ fn clone(&self) -> Self { *self } } -impl<'stream, 'a, T: PortableBitSemantics> Copy for HostAndDeviceConstRefAsync<'stream, 'a, T> {} +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Copy + for HostAndDeviceConstRefAsync<'stream, 'a, T> +{ +} -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceConstRefAsync<'stream, 'a, T> +{ /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` #[must_use] pub const unsafe fn new( - device_box: &'a DeviceBox>, + device_box: &'a DeviceBox>, host_ref: &'a T, stream: &'stream Stream, ) -> Self { @@ -478,13 +491,15 @@ impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceConstRefAsync<'stream, ' } #[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics> { - device_box: &'a mut DeviceBox>, +pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { + device_box: &'a mut DeviceBox>, host_val: &'a mut T, stream: PhantomData<&'stream 
Stream>, } -impl<'stream, 'a, T: PortableBitSemantics> HostAndDeviceOwnedAsync<'stream, 'a, T> { +impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> + HostAndDeviceOwnedAsync<'stream, 'a, T> +{ #[must_use] /// # Safety /// diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 29b3795c0..b6ed5b8e7 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -226,9 +226,9 @@ impl RawPtxKernel { /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does /// not contain an entry point named `entry_point`. pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult { - let module = Box::new(Module::load_from_string(ptx)?); + let module: Box = Box::new(Module::load_from_string(ptx)?); - let function = unsafe { &*(module.as_ref() as *const Module) }.get_function(entry_point); + let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point); let function = match function { Ok(function) => function, diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 9b2499b51..17d4bc3a5 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -37,12 +37,14 @@ impl DerefMut for PtxJit { } } -pub struct PerThreadShallowCopy { +pub struct PerThreadShallowCopy< + T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, +> { never: !, _marker: PhantomData, } -impl Deref +impl Deref for PerThreadShallowCopy { type Target = T; @@ -52,7 +54,7 @@ impl Deref } } -impl DerefMut +impl DerefMut for PerThreadShallowCopy { fn deref_mut(&mut self) -> &mut Self::Target { @@ -60,14 +62,20 @@ impl DerefMut } } -impl - CudaKernelParameter for PerThreadShallowCopy +impl< + T: Copy + + Send + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type AsyncHostType<'stream, 'b> = + 
crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::device_copy::SafeDeviceCopyWrapper; + type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; #[cfg(feature = "host")] type SyncHostType = T; @@ -77,9 +85,7 @@ impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - inner(crate::utils::device_copy::SafeDeviceCopyWrapper::from( - param, - )) + inner(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "host")] @@ -112,13 +118,24 @@ impl sealed::Sealed - for PerThreadShallowCopy +impl< + T: Copy + + Send + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for PerThreadShallowCopy { } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - CudaKernelParameter for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; @@ -167,13 +184,25 @@ impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableB inner(param) } } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - sealed::Sealed for &'a PerThreadShallowCopy +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for &'a PerThreadShallowCopy { } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - CudaKernelParameter for &'a PtxJit> +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > 
CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = @@ -226,20 +255,35 @@ impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableB ) } } -impl<'a, T: 'static + Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics> - sealed::Sealed for &'a PtxJit> +impl< + 'a, + T: 'static + + Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout, + > sealed::Sealed for &'a PtxJit> { } pub struct ShallowInteriorMutable< - T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, > { never: !, _marker: PhantomData, } -impl Deref - for ShallowInteriorMutable +impl< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > Deref for ShallowInteriorMutable { type Target = T; @@ -251,8 +295,10 @@ impl CudaKernelParameter for &'a ShallowInteriorMutable { @@ -309,7 +355,11 @@ impl< } impl< 'a, - T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + InteriorMutableSync, + T: crate::safety::StackOnly + + Sync + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, > sealed::Sealed for &'a ShallowInteriorMutable { } @@ -355,10 +405,12 @@ impl Deref for SharedHeapPerThreadShallowCopy { } impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] @@ -412,15 +464,19 @@ impl< } } impl< - T: RustToCuda< - CudaRepresentation: crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + 
RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for SharedHeapPerThreadShallowCopy { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter + for &'a SharedHeapPerThreadShallowCopy +{ #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< 'stream, @@ -471,13 +527,15 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThrea unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} impl< - T: RustToCuda< - CudaRepresentation: 'static + crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -535,15 +593,17 @@ impl< } } impl< - T: RustToCuda< - CudaRepresentation: crate::safety::PortableBitSemantics, - CudaAllocation: EmptyCudaAlloc, - >, + T: Send + + Clone + + RustToCuda< + CudaRepresentation: 'static + crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, > sealed::Sealed for PtxJit> { } -impl<'a, T: 'static + RustToCuda> CudaKernelParameter +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -601,7 +661,10 @@ impl<'a, T: 'static + RustToCuda> CudaKernelParameter ) } } -impl<'a, T: RustToCuda> sealed::Sealed for &'a PtxJit> {} +impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed + for &'a PtxJit> +{ +} #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 
e2a78999b..4156b1a29 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -17,7 +17,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -28,7 +28,7 @@ pub struct BoxCudaRepresentation(Devi unsafe impl RustToCuda for Box { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxCudaRepresentation; @@ -42,8 +42,9 @@ unsafe impl RustToCuda for Box { DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( @@ -62,7 +63,7 @@ unsafe impl RustToCuda for Box { let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 677fcca7d..575ea4ef6 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -19,7 +19,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -34,7 +34,8 @@ pub struct BoxedSliceCudaRepresentation RustToCuda for Box<[T]> { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation 
= + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = BoxedSliceCudaRepresentation; @@ -49,7 +50,7 @@ unsafe impl RustToCuda for Box<[T]> { CombinedCudaAlloc, )> { let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( @@ -71,7 +72,7 @@ unsafe impl RustToCuda for Box<[T]> { let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index f12f24861..fab89b89d 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -8,7 +8,7 @@ use rustacuda::error::CudaResult; use crate::{ lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, safety::PortableBitSemantics, - utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, + utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible}, }; #[cfg(feature = "host")] @@ -145,38 +145,36 @@ unsafe impl CudaAsRust for OptionCudaRepresentation { } } -impl RustToCudaProxy> - for Option> +impl RustToCudaProxy> + for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &*(val as *const Option).cast() } + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &*core::ptr::from_ref(val).cast() } } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &mut *(val as *mut Option).cast() } + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + unsafe { &mut *core::ptr::from_mut(val).cast() } } fn into(self) -> Option { 
- self.map(SafeDeviceCopyWrapper::into_inner) + self.map(RustToCudaWithPortableBitCopySemantics::into_inner) } } -impl RustToCudaAsyncProxy> - for Option> +impl RustToCudaAsyncProxy> + for Option> { fn from_ref(val: &Option) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &*(val as *const Option).cast() } + >>::from_ref(val) } fn from_mut(val: &mut Option) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype - unsafe { &mut *(val as *mut Option).cast() } + >>::from_mut(val) } fn into(self) -> Option { - self.map(SafeDeviceCopyWrapper::into_inner) + >>::into(self) } } diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 39ba6117d..c068920ab 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -32,7 +32,7 @@ pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLay unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefCudaRepresentation<'a, T>; @@ -46,8 +46,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(RefCudaRepresentation { diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index 
33d0fa6e7..2a59d8953 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -32,7 +32,7 @@ pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraph unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut T { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = CudaDropWrapper>>; + type CudaAllocation = CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = RefMutCudaRepresentation<'a, T>; @@ -46,8 +46,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu DeviceAccessible, CombinedCudaAlloc, )> { - let mut device_box = - CudaDropWrapper::from(DeviceBox::new(SafeDeviceCopyWrapper::from_ref(&**self))?); + let mut device_box = CudaDropWrapper::from(DeviceBox::new( + DeviceCopyWithPortableBitSemantics::from_ref(&**self), + )?); Ok(( DeviceAccessible::from(RefMutCudaRepresentation { @@ -67,7 +68,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut(&mut **self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut(&mut **self))?; core::mem::drop(alloc_front); diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 4b7898571..70d3a1e63 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -33,7 +33,8 
@@ pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGra unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T] { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefCudaRepresentation<'a, T>; @@ -48,7 +49,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T CombinedCudaAlloc, )> { let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index 9246fa474..0e802ccca 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -18,7 +18,7 @@ use crate::utils::ffi::DeviceAccessible; use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, - utils::device_copy::SafeDeviceCopyWrapper, + utils::adapter::DeviceCopyWithPortableBitSemantics, }; #[doc(hidden)] @@ -33,7 +33,8 @@ pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + Type unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mut [T] { #[cfg(all(feature = "host", not(doc)))] - type CudaAllocation = crate::host::CudaDropWrapper>>; + type CudaAllocation = + crate::host::CudaDropWrapper>>; #[cfg(any(not(feature = "host"), doc))] type CudaAllocation = crate::alloc::SomeCudaAlloc; type CudaRepresentation = SliceRefMutCudaRepresentation<'a, T>; @@ -48,7 +49,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu CombinedCudaAlloc, )> { let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice( - SafeDeviceCopyWrapper::from_slice(self), + 
DeviceCopyWithPortableBitSemantics::from_slice(self), )?); Ok(( @@ -70,7 +71,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu let (alloc_front, alloc_tail) = alloc.split(); - alloc_front.copy_to(SafeDeviceCopyWrapper::from_mut_slice(self))?; + alloc_front.copy_to(DeviceCopyWithPortableBitSemantics::from_mut_slice(self))?; core::mem::drop(alloc_front); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 2fac0a08e..603064fb8 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -6,17 +6,18 @@ use rustacuda::error::CudaError; #[allow(clippy::module_name_repetitions)] pub use rust_cuda_derive::LendRustToCuda; -use crate::alloc::CudaAlloc; - +#[cfg(any(feature = "host", feature = "device", doc))] +use crate::safety::StackOnly; #[cfg(feature = "device")] use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; +#[cfg(any(feature = "host", feature = "device"))] +use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; #[cfg(feature = "host")] use crate::{ - alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, }; -#[cfg(any(feature = "host", feature = "device"))] -use crate::{safety::PortableBitSemantics, utils::ffi::DeviceAccessible}; mod impls; @@ -163,7 +164,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result; - /// Moves `self` to CUDA iff `self` has [`PortableBitSemantics`] + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// /// # Errors /// @@ -179,7 +180,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: RustToCuda; + Self: RustToCuda; } #[cfg(feature = "host")] @@ -215,7 +216,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -249,8 +250,7 @@ pub trait BorrowFromRust: RustToCuda { inner: F, ) -> O where - Self: Sized, - ::CudaRepresentation: PortableBitSemantics; + Self: Sized + RustToCuda; } #[cfg(feature = "device")] @@ -273,8 +273,7 @@ impl BorrowFromRust for T { inner: F, ) -> O where - Self: Sized, - ::CudaRepresentation: PortableBitSemantics, + Self: RustToCuda, { inner(CudaAsRust::as_rust(cuda_repr.as_mut())) } diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs new file mode 100644 index 000000000..8be7712ef --- /dev/null +++ b/src/utils/adapter.rs @@ -0,0 +1,248 @@ +#![allow(clippy::trait_duplication_in_bounds)] + +use const_type_layout::{TypeGraphLayout, TypeLayout}; + +use crate::{ + alloc::NoCudaAlloc, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + safety::PortableBitSemantics, +}; + +#[cfg(any(feature = "host", feature = "device"))] +use crate::utils::ffi::DeviceAccessible; + +#[cfg(feature = "host")] +use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; + +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCopySemantics( + T, +); + +impl From + for RustToCudaWithPortableBitCopySemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl RustToCudaWithPortableBitCopySemantics { + #[must_use] + pub const fn from_copy(value: &T) -> Self { + Self(*value) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub 
const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCopySemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCopySemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = 
CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCopySemantics +{ + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async( + &self, + alloc: A, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(*self), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + _stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCopySemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct DeviceCopyWithPortableBitSemantics(T); + +unsafe impl rustacuda_core::DeviceCopy + for DeviceCopyWithPortableBitSemantics +{ +} + +impl From for DeviceCopyWithPortableBitSemantics { + fn from(value: T) -> Self { + Self(value) + } +} + +impl DeviceCopyWithPortableBitSemantics { + #[must_use] + pub fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { 
&*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`DeviceCopyWithPortableBitSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} diff --git a/src/utils/device_copy.rs b/src/utils/device_copy.rs deleted file mode 100644 index 72bd7d64e..000000000 --- a/src/utils/device_copy.rs +++ /dev/null @@ -1,150 +0,0 @@ -#![allow(clippy::trait_duplication_in_bounds)] - -use const_type_layout::{TypeGraphLayout, TypeLayout}; - -use crate::{ - alloc::NoCudaAlloc, - 
lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, - safety::PortableBitSemantics, -}; - -#[cfg(any(feature = "host", feature = "device"))] -use crate::utils::ffi::DeviceAccessible; - -#[cfg(feature = "host")] -use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; - -#[derive(Copy, Clone, Debug, TypeLayout)] -#[repr(transparent)] -pub struct SafeDeviceCopyWrapper(T); - -unsafe impl rustacuda_core::DeviceCopy for SafeDeviceCopyWrapper {} - -impl From for SafeDeviceCopyWrapper { - fn from(value: T) -> Self { - Self(value) - } -} - -impl SafeDeviceCopyWrapper { - #[must_use] - pub fn into_inner(self) -> T { - self.0 - } - - #[must_use] - pub const fn from_ref(reference: &T) -> &Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &*(reference as *const T).cast() } - } - - #[must_use] - pub const fn into_ref(&self) -> &T { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &*(self as *const Self).cast() } - } - - #[must_use] - pub fn from_mut(reference: &mut T) -> &mut Self { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &mut *(reference as *mut T).cast() } - } - - #[must_use] - pub fn into_mut(&mut self) -> &mut T { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { &mut *(self as *mut Self).cast() } - } - - #[must_use] - pub const fn from_slice(slice: &[T]) -> &[Self] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - #[must_use] - pub const fn into_slice(slice: &[Self]) -> &[T] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } - } - - #[must_use] - pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { 
core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } - - #[must_use] - pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { - // Safety: [`SafeDeviceCopyWrapper`] is a transparent newtype around [`T`] - unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } - } -} - -unsafe impl RustToCuda for SafeDeviceCopyWrapper { - type CudaAllocation = NoCudaAlloc; - type CudaRepresentation = Self; - - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow( - &self, - alloc: A, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(&self.0), alloc)) - } - - #[cfg(feature = "host")] - unsafe fn restore( - &mut self, - alloc: CombinedCudaAlloc, - ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - - Ok(alloc_tail) - } -} - -unsafe impl RustToCudaAsync - for SafeDeviceCopyWrapper -{ - #[cfg(feature = "host")] - #[allow(clippy::type_complexity)] - unsafe fn borrow_async( - &self, - alloc: A, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, - )> { - let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(&self.0), alloc)) - } - - #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - - Ok(alloc_tail) - } -} - -unsafe impl CudaAsRust for SafeDeviceCopyWrapper { - type RustRepresentation = Self; - - #[cfg(feature = "device")] - unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { - let mut uninit = core::mem::MaybeUninit::uninit(); - core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); - uninit.assume_init() - } -} diff --git 
a/src/utils/exchange/buffer/common.rs b/src/utils/exchange/buffer/common.rs index cfacf61a2..079dba419 100644 --- a/src/utils/exchange/buffer/common.rs +++ b/src/utils/exchange/buffer/common.rs @@ -1,6 +1,10 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; -use crate::{lend::CudaAsRust, safety::PortableBitSemantics, utils::ffi::DeviceMutPointer}; +use crate::{ + lend::CudaAsRust, + safety::{PortableBitSemantics, StackOnly}, + utils::ffi::DeviceMutPointer, +}; use super::{CudaExchangeBuffer, CudaExchangeItem}; @@ -9,7 +13,7 @@ use super::{CudaExchangeBuffer, CudaExchangeItem}; #[derive(TypeLayout)] #[repr(C)] pub struct CudaExchangeBufferCudaRepresentation< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, >( @@ -17,8 +21,8 @@ pub struct CudaExchangeBufferCudaRepresentation< pub(super) usize, ); -unsafe impl CudaAsRust - for CudaExchangeBufferCudaRepresentation +unsafe impl + CudaAsRust for CudaExchangeBufferCudaRepresentation { type RustRepresentation = CudaExchangeBuffer; diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs index 8c4b3b6ee..5083263b3 100644 --- a/src/utils/exchange/buffer/device.rs +++ b/src/utils/exchange/buffer/device.rs @@ -2,18 +2,21 @@ use core::ops::{Deref, DerefMut}; use const_type_layout::TypeGraphLayout; -use crate::{deps::alloc::boxed::Box, safety::PortableBitSemantics}; +use crate::{ + deps::alloc::boxed::Box, + safety::{PortableBitSemantics, StackOnly}, +}; use super::CudaExchangeItem; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferDevice< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, >(pub(super) core::mem::ManuallyDrop]>>); -impl Deref +impl Deref for CudaExchangeBufferDevice { type Target = [CudaExchangeItem]; @@ -23,8 +26,8 @@ impl DerefMut - for CudaExchangeBufferDevice +impl + DerefMut for 
CudaExchangeBufferDevice { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index f7fedc804..e62227d8e 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -12,9 +12,9 @@ use rustacuda::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc, NoCudaAlloc}, host::CudaDropWrapper, - safety::PortableBitSemantics, + safety::{PortableBitSemantics, StackOnly}, utils::{ - device_copy::SafeDeviceCopyWrapper, + adapter::DeviceCopyWithPortableBitSemantics, ffi::{DeviceAccessible, DeviceMutPointer}, }, }; @@ -23,29 +23,35 @@ use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem}; #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBufferHost< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { - host_buffer: - CudaDropWrapper>>>, + host_buffer: CudaDropWrapper< + LockedBuffer>>, + >, device_buffer: UnsafeCell< - CudaDropWrapper>>>, + CudaDropWrapper< + DeviceBuffer>>, + >, >, } -impl - CudaExchangeBufferHost +impl< + T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > CudaExchangeBufferHost { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn new(elem: &T, capacity: usize) -> CudaResult { // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T - let elem: &CudaExchangeItem = unsafe { &*(elem as *const T).cast() }; + let elem: &CudaExchangeItem = unsafe { &*std::ptr::from_ref(elem).cast() }; let host_buffer = CudaDropWrapper::from(LockedBuffer::new( - SafeDeviceCopyWrapper::from_ref(elem), + DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( @@ -59,7 +65,7 @@ impl +impl CudaExchangeBufferHost { /// # Errors @@ -67,14 
+73,16 @@ impl) -> CudaResult { let host_buffer = unsafe { - let mut uninit: CudaDropWrapper>> = + let mut uninit: CudaDropWrapper>> = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); for (i, src) in vec.into_iter().enumerate() { uninit .as_mut_ptr() .add(i) - .write(SafeDeviceCopyWrapper::from(CudaExchangeItem(src))); + .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem( + src, + ))); } uninit @@ -91,25 +99,25 @@ impl Deref +impl Deref for CudaExchangeBufferHost { type Target = [CudaExchangeItem]; fn deref(&self) -> &Self::Target { - SafeDeviceCopyWrapper::into_slice(self.host_buffer.as_slice()) + DeviceCopyWithPortableBitSemantics::into_slice(self.host_buffer.as_slice()) } } -impl DerefMut - for CudaExchangeBufferHost +impl + DerefMut for CudaExchangeBufferHost { fn deref_mut(&mut self) -> &mut Self::Target { - SafeDeviceCopyWrapper::into_mut_slice(self.host_buffer.as_mut_slice()) + DeviceCopyWithPortableBitSemantics::into_mut_slice(self.host_buffer.as_mut_slice()) } } -impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] @@ -162,7 +170,7 @@ impl +impl CudaExchangeBufferHost { #[allow(clippy::type_complexity)] diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index f493f316c..31c76f1b7 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -6,10 +6,9 @@ use core::{ use const_type_layout::TypeLayout; -#[cfg(any(feature = "host", feature = "device"))] use const_type_layout::TypeGraphLayout; -use crate::safety::PortableBitSemantics; +use crate::safety::{PortableBitSemantics, StackOnly}; #[cfg(any(feature = "host", feature = "device"))] use crate::{ @@ -36,7 +35,7 @@ mod host; #[cfg(any(feature = "host", feature = "device"))] #[allow(clippy::module_name_repetitions)] pub struct CudaExchangeBuffer< - T: PortableBitSemantics + TypeGraphLayout, + T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, const M2H: bool, > { @@ -47,8 +46,11 @@ pub 
struct CudaExchangeBuffer< } #[cfg(feature = "host")] -impl - CudaExchangeBuffer +impl< + T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > CudaExchangeBuffer { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside @@ -61,7 +63,7 @@ impl +impl CudaExchangeBuffer { /// # Errors @@ -75,7 +77,7 @@ impl Deref +impl Deref for CudaExchangeBuffer { type Target = [CudaExchangeItem]; @@ -86,8 +88,8 @@ impl DerefMut - for CudaExchangeBuffer +impl + DerefMut for CudaExchangeBuffer { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner @@ -95,8 +97,8 @@ impl RustToCuda - for CudaExchangeBuffer +unsafe impl + RustToCuda for CudaExchangeBuffer { type CudaAllocation = NoCudaAlloc; type CudaRepresentation = CudaExchangeBufferCudaRepresentation; @@ -124,7 +126,7 @@ unsafe impl +unsafe impl RustToCudaAsync for CudaExchangeBuffer { #[cfg(feature = "host")] @@ -154,12 +156,14 @@ unsafe impl(T); -impl CudaExchangeItem { +impl + CudaExchangeItem +{ #[cfg(feature = "host")] pub const fn read(&self) -> &T { &self.0 @@ -171,7 +175,9 @@ impl CudaExchangeIte } } -impl CudaExchangeItem { +impl + CudaExchangeItem +{ #[cfg(feature = "device")] pub const fn read(&self) -> &T { &self.0 @@ -183,13 +189,15 @@ impl CudaExchangeIte } } -impl AsMut for CudaExchangeItem { +impl AsMut + for CudaExchangeItem +{ fn as_mut(&mut self) -> &mut T { &mut self.0 } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -201,7 +209,7 @@ impl CudaExchangeItem } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_scratch(&self) -> &T { &self.0 @@ -213,13 +221,13 @@ impl CudaExchangeItem } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "host")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent 
newtype - unsafe { &*(self as *const Self).cast() } + unsafe { &*core::ptr::from_ref(self).cast() } } #[cfg(feature = "host")] @@ -227,17 +235,17 @@ impl CudaExchangeItem // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &mut *(self as *mut Self).cast() } + unsafe { &mut *core::ptr::from_mut(self).cast() } } } -impl CudaExchangeItem { +impl CudaExchangeItem { #[cfg(feature = "device")] pub const fn as_uninit(&self) -> &MaybeUninit { // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &*(self as *const Self).cast() } + unsafe { &*core::ptr::from_ref(self).cast() } } #[cfg(feature = "device")] @@ -245,6 +253,6 @@ impl CudaExchangeItem // Safety: // - MaybeUninit is a transparent newtype union // - CudaExchangeItem is a transparent newtype - unsafe { &mut *(self as *mut Self).cast() } + unsafe { &mut *core::ptr::from_mut(self).cast() } } } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 9eedb058e..b7bbeba09 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -20,17 +20,25 @@ use crate::{ HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::{device_copy::SafeDeviceCopyWrapper, ffi::DeviceAccessible}, + utils::{adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible}, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, move_event: CudaDropWrapper, } @@ -39,10 +47,18 @@ pub struct ExchangeWrapperOnHost> pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { 
value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, move_event: CudaDropWrapper, stream: PhantomData<&'stream Stream>, @@ -53,10 +69,18 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, @@ -66,10 +90,18 @@ pub struct ExchangeWrapperOnDevice pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { value: T, device_box: CudaDropWrapper< - DeviceBox::CudaRepresentation>>>, + DeviceBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, locked_cuda_repr: CudaDropWrapper< - LockedBox::CudaRepresentation>>>, + LockedBox< + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, + >, >, null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, @@ -90,11 +122,13 @@ impl> ExchangeWrapperOnHost { let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?; let locked_cuda_repr = unsafe { let mut uninit = CudaDropWrapper::from(LockedBox::< - SafeDeviceCopyWrapper::CudaRepresentation>>, + DeviceCopyWithPortableBitSemantics< + DeviceAccessible<::CudaRepresentation>, + >, >::uninitialized()?); uninit .as_mut_ptr() - .write(SafeDeviceCopyWrapper::from(cuda_repr)); + 
.write(DeviceCopyWithPortableBitSemantics::from(cuda_repr)); uninit }; @@ -122,7 +156,7 @@ impl> ExchangeWrapperOnHost { /// CUDA pub fn move_to_device(mut self) -> CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?; - **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); self.device_box.copy_from(&**self.locked_cuda_repr)?; @@ -152,7 +186,7 @@ impl> ExchangeWrapperOnHost CudaResult> { let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; - **self.locked_cuda_repr = SafeDeviceCopyWrapper::from(cuda_repr); + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); // Safety: The device value is not safely exposed until either // - the passed-in [`Stream`] is synchronised diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs index 3b205ffca..52d7f691d 100644 --- a/src/utils/ffi.rs +++ b/src/utils/ffi.rs @@ -5,18 +5,18 @@ use core::{ ops::{Deref, DerefMut}, }; #[cfg(feature = "host")] -use std::{fmt, mem::MaybeUninit, ptr::copy_nonoverlapping}; +use std::fmt; -use const_type_layout::TypeLayout; +use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::safety::PortableBitSemantics; #[cfg(feature = "host")] -use crate::{lend::CudaAsRust, utils::device_copy::SafeDeviceCopyWrapper}; +use crate::{lend::CudaAsRust, utils::adapter::RustToCudaWithPortableBitCopySemantics}; #[cfg_attr(any(feature = "device", doc), derive(Debug))] #[derive(TypeLayout)] #[repr(transparent)] -pub struct DeviceAccessible(T); +pub struct DeviceAccessible(T); #[cfg(feature = "host")] impl From for DeviceAccessible { @@ -25,22 +25,19 @@ impl From for DeviceAccessible { } } -// TODO: should there be some copy bound here? 
#[cfg(feature = "host")] -impl From<&T> for DeviceAccessible> { +impl From<&T> + for DeviceAccessible> +{ fn from(value: &T) -> Self { - let value = unsafe { - let mut uninit = MaybeUninit::uninit(); - copy_nonoverlapping(value, uninit.as_mut_ptr(), 1); - uninit.assume_init() - }; - - Self(SafeDeviceCopyWrapper::from(value)) + Self(RustToCudaWithPortableBitCopySemantics::from_copy(value)) } } #[cfg(all(feature = "host", not(doc)))] -impl fmt::Debug for DeviceAccessible { +impl fmt::Debug + for DeviceAccessible +{ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct(stringify!(DeviceAccessible)) .finish_non_exhaustive() @@ -48,7 +45,7 @@ impl fmt::Debug for DeviceAccessi } #[cfg(feature = "device")] -impl Deref for DeviceAccessible { +impl Deref for DeviceAccessible { type Target = T; fn deref(&self) -> &Self::Target { @@ -57,7 +54,7 @@ impl Deref for DeviceAccessible { } #[cfg(feature = "device")] -impl DerefMut for DeviceAccessible { +impl DerefMut for DeviceAccessible { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 65a4379fb..bab467e42 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,5 +1,5 @@ +pub mod adapter; pub mod aliasing; -pub mod device_copy; pub mod exchange; pub mod ffi; pub mod shared; From 76af5f11712529b35af113c2676071d471af04a3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 28 Dec 2023 12:45:33 +0000 Subject: [PATCH 072/120] First exploration towards a stricter async CUDA API --- .../src/rust_to_cuda/field_copy.rs | 17 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 7 +- rust-cuda-derive/src/rust_to_cuda/mod.rs | 32 +-- src/alloc.rs | 17 +- src/lend/impls/box.rs | 106 +++++++++- src/lend/impls/option.rs | 6 +- src/lend/mod.rs | 135 ++++++++++++- src/utils/adapter.rs | 2 + src/utils/aliasing/const.rs | 6 +- src/utils/aliasing/dynamic.rs | 6 +- src/utils/async.rs | 186 ++++++++++++++++++ src/utils/exchange/buffer/mod.rs | 2 + 
src/utils/exchange/wrapper.rs | 45 +++-- src/utils/mod.rs | 1 + 14 files changed, 519 insertions(+), 49 deletions(-) create mode 100644 src/utils/async.rs diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 10f528730..1baf8829e 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -12,6 +12,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( cuda_repr_field_ty: &CudaReprFieldTy, mut combined_cuda_alloc_type: TokenStream, + mut combined_cuda_alloc_async_type: TokenStream, r2c_field_declarations: &mut Vec, r2c_field_async_declarations: &mut Vec, @@ -20,7 +21,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_async_destructors: &mut Vec, c2r_field_initialisations: &mut Vec, -) -> TokenStream { +) -> (TokenStream, TokenStream) { #[allow(clippy::option_if_let_else)] let field_accessor = match &field.ident { Some(ident) => quote! { #ident }, @@ -63,6 +64,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#field_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! { let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( @@ -109,6 +116,12 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #combined_cuda_alloc_type > }; + combined_cuda_alloc_async_type = quote! { + #crate_path::alloc::CombinedCudaAlloc< + <#proxy_ty as #crate_path::lend::RustToCudaAsync>::CudaAllocationAsync, + #combined_cuda_alloc_async_type + > + }; r2c_field_declarations.push(quote! 
{ let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCuda::borrow( @@ -160,5 +173,5 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }, } - combined_cuda_alloc_type + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 612d77c5a..d1249720e 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -125,6 +125,7 @@ pub fn rust_to_cuda_async_trait( struct_name_cuda: &syn::Ident, struct_generics_cuda_async: &syn::Generics, struct_fields_cuda: &syn::Fields, + combined_cuda_alloc_async_type: &TokenStream, r2c_field_async_declarations: &[TokenStream], r2c_field_initialisations: &[TokenStream], r2c_field_async_destructors: &[TokenStream], @@ -149,6 +150,8 @@ pub fn rust_to_cuda_async_trait( unsafe impl #impl_generics #crate_path::lend::RustToCudaAsync for #struct_name #ty_generics #where_clause { + type CudaAllocationAsync = #combined_cuda_alloc_async_type; + #[cfg(not(target_os = "cuda"))] unsafe fn borrow_async( &self, @@ -156,7 +159,7 @@ pub fn rust_to_cuda_async_trait( stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::ffi::DeviceAccessible, - #crate_path::alloc::CombinedCudaAlloc + #crate_path::alloc::CombinedCudaAlloc )> { let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; @@ -175,7 +178,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn restore_async( &mut self, alloc: #crate_path::alloc::CombinedCudaAlloc< - Self::CudaAllocation, CudaAllocType + Self::CudaAllocationAsync, CudaAllocType >, stream: &#crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult { diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 6a885ac94..77382d4c4 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ 
b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -33,6 +33,9 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let mut combined_cuda_alloc_type: TokenStream = quote! { #crate_path::alloc::NoCudaAlloc }; + let mut combined_cuda_alloc_async_type: TokenStream = quote! { + #crate_path::alloc::NoCudaAlloc + }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); @@ -57,19 +60,21 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { let cuda_repr_field_ty = field_ty::swap_field_type_and_filter_attrs(&crate_path, field); - combined_cuda_alloc_type = field_copy::impl_field_copy_init_and_expand_alloc_type( - &crate_path, - field, - field_index, - &cuda_repr_field_ty, - combined_cuda_alloc_type, - &mut r2c_field_declarations, - &mut r2c_field_async_declarations, - &mut r2c_field_initialisations, - &mut r2c_field_destructors_reverse, - &mut r2c_field_async_destructors_reverse, - &mut c2r_field_initialisations, - ); + (combined_cuda_alloc_type, combined_cuda_alloc_async_type) = + field_copy::impl_field_copy_init_and_expand_alloc_type( + &crate_path, + field, + field_index, + &cuda_repr_field_ty, + combined_cuda_alloc_type, + combined_cuda_alloc_async_type, + &mut r2c_field_declarations, + &mut r2c_field_async_declarations, + &mut r2c_field_initialisations, + &mut r2c_field_destructors_reverse, + &mut r2c_field_async_destructors_reverse, + &mut c2r_field_initialisations, + ); } // The fields must be deallocated in the reverse order of their allocation @@ -110,6 +115,7 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &struct_name_cuda, &struct_generics_cuda_async, &struct_fields_cuda, + &combined_cuda_alloc_async_type, &r2c_field_async_declarations, &r2c_field_initialisations, &r2c_field_async_destructors, diff --git a/src/alloc.rs b/src/alloc.rs index f16178aec..80d0ee840 100644 --- 
a/src/alloc.rs +++ b/src/alloc.rs @@ -1,6 +1,6 @@ #![allow(clippy::module_name_repetitions)] -pub trait EmptyCudaAlloc: sealed::empty::Sealed {} +pub trait EmptyCudaAlloc: From + Into + sealed::empty::Sealed {} pub trait CudaAlloc: sealed::alloc::Sealed {} @@ -30,6 +30,21 @@ impl sealed::empty for CombinedCudaAlloc { } +impl From + for CombinedCudaAlloc +{ + fn from(_: NoCudaAlloc) -> Self { + Self(A::from(NoCudaAlloc), B::from(NoCudaAlloc)) + } +} +impl From> + for NoCudaAlloc +{ + fn from(val: CombinedCudaAlloc) -> Self { + let _: (Self, Self) = (val.0.into(), val.1.into()); + Self + } +} impl CombinedCudaAlloc { #[must_use] pub const fn new(front: A, tail: B) -> Self { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 4156b1a29..1ec853b34 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -1,13 +1,16 @@ -use crate::{deps::alloc::boxed::Box, utils::ffi::DeviceOwnedPointer}; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + deps::alloc::boxed::Box, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, + utils::ffi::DeviceOwnedPointer, }; #[cfg(any(feature = "host", feature = "device"))] @@ -71,6 +74,103 @@ unsafe impl RustToCuda for Box { } } +unsafe impl RustToCudaAsync for Box { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async( + &self, + alloc: A, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + use 
rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, stream)?; + + Ok(( + DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer( + device_box.as_device_ptr().as_raw_mut().cast(), + ))), + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async( + &mut self, + alloc: CombinedCudaAlloc, + stream: &rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult { + use rustacuda::memory::AsyncCopyDestination; + + struct PromiseSend(T); + #[allow(clippy::non_send_fields_in_send_ty)] + unsafe impl Send for PromiseSend {} + + let (alloc_front, alloc_tail) = alloc.split(); + let (mut locked_box, device_box) = alloc_front.split(); + + device_box.async_copy_to(&mut *locked_box, stream)?; + + { + // TODO: express this unsafe-rich completion safely + // by explicitly capturing &mut self until the + // async restore has completed + let self_ptr: *mut T = std::ptr::from_mut(self); + + let self_ptr = PromiseSend(self_ptr); + let locked_box = PromiseSend(locked_box); + let device_box = PromiseSend(device_box); + + stream.add_callback(Box::new(move |res| { + let self_ptr: PromiseSend<_> = self_ptr; + + std::mem::drop(device_box); + if res == Ok(()) { + // Safety: The precondition of this method guarantees that + // &mut self has been borrowed until after this + // completion is run + unsafe { + std::ptr::copy_nonoverlapping( + locked_box.0.as_ptr().cast::(), + self_ptr.0, + 1, + ); + } + } + std::mem::drop(locked_box); + })) + }?; + + Ok(alloc_tail) + } +} + unsafe 
impl CudaAsRust for BoxCudaRepresentation { type RustRepresentation = Box; diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index fab89b89d..a0f0f8b4a 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -79,6 +79,8 @@ unsafe impl RustToCuda for Option { } unsafe impl RustToCudaAsync for Option { + type CudaAllocationAsync = Option<::CudaAllocationAsync>; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -87,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { stream: &rustacuda::stream::Stream, ) -> CudaResult<( DeviceAccessible, - CombinedCudaAlloc, + CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { None => ( @@ -118,7 +120,7 @@ unsafe impl RustToCudaAsync for Option { #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> CudaResult { let (alloc_front, alloc_tail) = alloc.split(); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 603064fb8..50f2aee1d 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,6 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, + utils::r#async::{Async, CudaAsync}, }; mod impls; @@ -72,6 +73,8 @@ pub unsafe trait RustToCuda { /// This is an internal trait and should ONLY be derived automatically using /// `#[derive(LendRustToCuda)]` pub unsafe trait RustToCudaAsync: RustToCuda { + type CudaAllocationAsync: CudaAlloc; + #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -81,11 +84,19 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// # Safety /// - /// This is an internal function and should NEVER be called manually + /// This is an internal function and should NEVER be called manually. 
+ /// /// The returned /// [`Self::CudaRepresentation`](RustToCuda::CudaRepresentation) must NEVER /// be accessed on the CPU as it contains a GPU-resident copy of /// `self`. + /// + /// Since this method may perform asynchronous computation but returns its + /// result immediately, this result must only be used to construct compound + /// asynchronous computations before it has been synchronized on. + /// + /// Similarly, `&self` should remain borrowed until synchronisation has + /// been performed. #[allow(clippy::type_complexity)] unsafe fn borrow_async( &self, @@ -93,7 +104,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - CombinedCudaAlloc, + CombinedCudaAlloc, )>; #[doc(hidden)] @@ -105,11 +116,17 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// /// # Safety /// - /// This is an internal function and should NEVER be called manually + /// This is an internal function and should NEVER be called manually. + /// + /// Since this method may perform asynchronous computation but returns + /// immediately, `&mut self` not be used until it has been synchronized on. + /// + /// Therefore, `&mut self` should remain mutably borrowed until + /// synchronisation has been performed. 
#[allow(clippy::type_complexity)] unsafe fn restore_async( &mut self, - alloc: CombinedCudaAlloc, + alloc: CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult; } @@ -228,6 +245,116 @@ impl LendToCuda for T { } } +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait LendToCudaAsync: RustToCudaAsync { + /// Lends an immutable copy of `&self` to CUDA: + /// - code in the CUDA kernel can only access `&self` through the + /// [`DeviceConstRef`] inside the closure + /// - after the closure, `&self` will not have changed + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + >, + ) -> Result, + >( + &self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result; + + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. + /// + /// # Errors + /// + /// Returns a [`CudaError`] iff an error occurs inside CUDA + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + >, + ) -> Result, + >( + self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result + where + Self: RustToCuda; +} + +#[cfg(feature = "host")] +impl LendToCudaAsync for T { + fn lend_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + >, + ) -> Result, + >( + &self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { + inner(Async::new(const_ref, stream)?) 
+ }); + + core::mem::drop(cuda_repr); + core::mem::drop(alloc); + + result + } + + fn move_to_cuda_async< + 'stream, + O, + E: From, + F: FnOnce( + Async< + 'stream, + HostAndDeviceOwned::CudaRepresentation>>, + >, + ) -> Result, + >( + self, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result + where + Self: RustToCuda, + { + let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { + inner(Async::new(owned_ref, stream)?) + }); + + core::mem::drop(alloc); + + result + } +} + #[cfg(feature = "device")] pub trait BorrowFromRust: RustToCuda { /// # Safety diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 8be7712ef..8e27d98df 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -129,6 +129,8 @@ unsafe impl RustToCuda unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { + type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index c36f814bf..8441a5bd1 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -215,6 +215,8 @@ unsafe impl RustToCuda unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsConstStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -223,7 +225,7 @@ unsafe impl RustToCudaAsync stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::alloc::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; @@ -236,7 +238,7 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: crate::alloc::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> 
rustacuda::error::CudaResult { self.0.restore_async(alloc, stream) diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 0ab97016c..f8a04fa06 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -193,6 +193,8 @@ unsafe impl RustToCuda for SplitSliceOverCudaThreadsDynamicStride } unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDynamicStride { + type CudaAllocationAsync = T::CudaAllocationAsync; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( @@ -201,7 +203,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( DeviceAccessible, - crate::alloc::CombinedCudaAlloc, + crate::alloc::CombinedCudaAlloc, )> { let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; @@ -217,7 +219,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] unsafe fn restore_async( &mut self, - alloc: crate::alloc::CombinedCudaAlloc, + alloc: crate::alloc::CombinedCudaAlloc, stream: &rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult { self.inner.restore_async(alloc, stream) diff --git a/src/utils/async.rs b/src/utils/async.rs new file mode 100644 index 000000000..b691b755f --- /dev/null +++ b/src/utils/async.rs @@ -0,0 +1,186 @@ +#[cfg(feature = "host")] +use std::{ + future::Future, future::IntoFuture, future::Ready, marker::PhantomData, sync::Arc, sync::Mutex, + task::Poll, task::Waker, +}; + +#[cfg(feature = "host")] +use rustacuda::{ + error::CudaError, error::CudaResult, event::Event, event::EventFlags, event::EventStatus, + stream::Stream, stream::StreamWaitEventFlags, +}; + +#[cfg(feature = "host")] +use crate::host::CudaDropWrapper; + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { + /// Wraps a still-asynchronous `value` which is being computed on `stream` + /// such that its 
computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn new(value: T, stream: &'stream Stream) -> CudaResult; + + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn synchronize(self) -> CudaResult; + + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + fn move_to_stream<'stream_new>( + self, + stream: &'stream_new Stream, + ) -> CudaResult>; +} + +#[cfg(feature = "host")] +pub struct Sync { + value: T, +} + +#[cfg(feature = "host")] +impl<'stream, T> CudaAsync<'stream, T> for Sync { + fn new(value: T, _stream: &'stream Stream) -> CudaResult { + Ok(Self { value }) + } + + fn synchronize(self) -> CudaResult { + Ok(self.value) + } + + #[allow(refining_impl_trait)] + fn move_to_stream(self, _stream: &Stream) -> CudaResult { + Ok(self) + } +} + +#[cfg(feature = "host")] +impl IntoFuture for Sync { + type IntoFuture = Ready>; + type Output = CudaResult; + + fn into_future(self) -> Self::IntoFuture { + std::future::ready(Ok(self.value)) + } +} + +#[cfg(feature = "host")] +pub struct Async<'stream, T> { + _stream: PhantomData<&'stream Stream>, + event: CudaDropWrapper, + waker: Arc>>, + value: T, +} + +#[cfg(feature = "host")] +impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { + fn new(value: T, stream: &'stream Stream) -> CudaResult { + let event = CudaDropWrapper::from(Event::new( + EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, + )?); + event.record(stream)?; + + let waker: Arc>> = Arc::new(Mutex::new(None)); + let waker_callback = waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut waker) 
= waker_callback.lock() { + if let Some(waker) = waker.take() { + waker.wake(); + } + } + }))?; + + Ok(Self { + _stream: PhantomData::<&'stream Stream>, + event, + waker, + value, + }) + } + + fn synchronize(self) -> CudaResult { + self.event.synchronize()?; + + Ok(self.value) + } + + #[allow(refining_impl_trait)] + fn move_to_stream<'stream_new>( + self, + stream: &'stream_new Stream, + ) -> CudaResult> { + stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; + self.event.record(stream)?; + + let waker_callback = self.waker.clone(); + stream.add_callback(Box::new(move |_| { + if let Ok(mut waker) = waker_callback.lock() { + if let Some(waker) = waker.take() { + waker.wake(); + } + } + }))?; + + Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + event: self.event, + waker: self.waker, + value: self.value, + }) + } +} + +#[cfg(feature = "host")] +impl<'stream, T> Async<'stream, T> { + /// # Safety + /// + /// The returned inner value of type `T` may not yet have completed its + /// asynchronous work and may thus be in an inconsistent state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_unchecked(self) -> T { + self.value + } +} + +#[cfg(feature = "host")] +impl<'stream, T> IntoFuture for Async<'stream, T> { + type Output = CudaResult; + + type IntoFuture = impl Future; + + fn into_future(self) -> Self::IntoFuture { + let mut wrapper = Some(self); + + std::future::poll_fn(move |cx| match &wrapper { + Some(Async { waker, event, .. }) => match event.query() { + Ok(EventStatus::NotReady) => waker.lock().map_or_else( + |_| Poll::Ready(Err(CudaError::OperatingSystemError)), + |mut waker| { + *waker = Some(cx.waker().clone()); + Poll::Pending + }, + ), + Ok(EventStatus::Ready) => match wrapper.take() { + Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }, + Err(err) => Poll::Ready(Err(err)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }) + } +} diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 31c76f1b7..9dfc4414e 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -129,6 +129,8 @@ unsafe impl RustToCudaAsync for CudaExchangeBuffer { + type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] #[allow(clippy::type_complexity)] unsafe fn borrow_async( diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index b7bbeba09..454ecc8f3 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -14,7 +14,7 @@ use rustacuda::{ }; use crate::{ - alloc::{CombinedCudaAlloc, EmptyCudaAlloc, NoCudaAlloc}, + alloc::{EmptyCudaAlloc, NoCudaAlloc}, host::{ CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, HostAndDeviceMutRefAsync, @@ -82,7 +82,6 @@ pub struct ExchangeWrapperOnDevice >, >, >, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, } @@ -103,7 +102,6 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda, >, >, - null_alloc: CombinedCudaAlloc<::CudaAllocation, NoCudaAlloc>, move_event: CudaDropWrapper, stream: &'stream Stream, waker: Arc>>, @@ -160,17 +158,20 @@ impl> ExchangeWrapperOnHost { self.device_box.copy_from(&**self.locked_cuda_repr)?; + let _: NoCudaAlloc = null_alloc.into(); + Ok(ExchangeWrapperOnDevice { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc, move_event: self.move_event, }) } } -impl> ExchangeWrapperOnHost { +impl> + ExchangeWrapperOnHost +{ /// Moves the data asynchronously to the CUDA device. 
/// /// To avoid aliasing, each CUDA thread will get access to its own shallow @@ -208,11 +209,12 @@ impl> ExchangeWrapperOnHost> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc: self.null_alloc, move_event: self.move_event, }) } @@ -362,7 +363,6 @@ impl<'stream, T: RustToCuda> value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - null_alloc: self.null_alloc, move_event: self.move_event, stream, waker: self.waker, @@ -406,8 +406,10 @@ impl<'stream, T: RustToCuda> /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -420,8 +422,10 @@ impl<'stream, T: RustToCuda> } } -impl<'stream, T: RustToCudaAsync> - ExchangeWrapperOnDeviceAsync<'stream, T> +impl< + 'stream, + T: RustToCudaAsync, + > ExchangeWrapperOnDeviceAsync<'stream, T> { /// Moves the data asynchronously back to the host CPU device. 
/// @@ -437,9 +441,10 @@ impl<'stream, T: RustToCudaAsync> mut self, stream: &'stream Stream, ) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = - unsafe { self.value.restore_async(self.null_alloc, stream) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU @@ -491,7 +496,6 @@ impl<'stream, T: RustToCuda> IntoFuture value: inner.value, device_box: inner.device_box, locked_cuda_repr: inner.locked_cuda_repr, - null_alloc: inner.null_alloc, move_event: inner.move_event, })), None => Poll::Ready(Err(CudaError::AlreadyAcquired)), @@ -515,8 +519,10 @@ impl> ExchangeWrapperOnDevice { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(self.null_alloc) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; // Note: Shallow changes are not reflected back to the CPU @@ -547,7 +553,9 @@ impl> ExchangeWrapperOnDevice { } } -impl> ExchangeWrapperOnDevice { +impl> + ExchangeWrapperOnDevice +{ /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow @@ -562,9 +570,10 @@ impl> ExchangeWrapperOnDevice mut self, stream: &Stream, ) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = - unsafe { self.value.restore_async(self.null_alloc, stream) }?; + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; // Note: Shallow changes are not reflected back to the CPU diff --git a/src/utils/mod.rs b/src/utils/mod.rs index bab467e42..e41a3c4ee 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,5 +1,6 @@ pub mod adapter; pub mod aliasing; +pub mod r#async; pub mod exchange; pub mod ffi; pub mod shared; From 8ec927a5e3b53239976b6584d1c1af4456c441f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 29 Dec 2023 14:47:46 +0000 Subject: [PATCH 073/120] More experiments with async API --- src/lend/impls/box.rs | 90 ++++++++++++++------- src/lend/mod.rs | 38 ++++++--- src/utils/async.rs | 184 ++++++++++++++++++++++++++++++++---------- 3 files changed, 227 insertions(+), 85 deletions(-) diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 1ec853b34..552f93e7e 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -21,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CudaAsync}, }; #[doc(hidden)] @@ -74,7 +75,7 @@ unsafe impl RustToCuda for Box { } } -unsafe impl RustToCudaAsync for Box { +unsafe impl RustToCudaAsync for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocationAsync = CombinedCudaAlloc< CudaDropWrapper>>>, @@ -121,11 +122,11 @@ unsafe impl RustToCudaAsync for Box( + unsafe fn restore_async<'stream, A: CudaAlloc>( &mut self, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + 
) -> CudaResult<(Async<'stream, (), &mut std::boxed::Box>, A)> { use rustacuda::memory::AsyncCopyDestination; struct PromiseSend(T); @@ -137,37 +138,64 @@ unsafe impl RustToCudaAsync for Box = self_ptr; + let locked_box = PromiseSend(locked_box); + let device_box = PromiseSend(device_box); + let r#async = + as crate::utils::r#async::CudaAsync< + (), + &mut Self, + >>::new((), stream, self, |data: &mut Self| { + // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(device_box); - if res == Ok(()) { - // Safety: The precondition of this method guarantees that - // &mut self has been borrowed until after this - // completion is run - unsafe { - std::ptr::copy_nonoverlapping( - locked_box.0.as_ptr().cast::(), - self_ptr.0, - 1, - ); - } + // Safety: equivalent to *data = *locked_box since + // LockedBox> doesn't drop T + unsafe { + std::ptr::copy_nonoverlapping( + locked_box.0.as_ptr().cast::(), + &mut **data, + 1, + ); } + // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(locked_box); - })) - }?; - - Ok(alloc_tail) + Ok(()) + })?; + // std::mem::drop(r#async); + + Ok((r#async, alloc_tail)) + + // { + // // TODO: express this unsafe-rich completion safely + // // by explicitly capturing &mut self until the + // // async restore has completed + // let self_ptr: *mut T = std::ptr::from_mut(self); + + // let self_ptr = PromiseSend(self_ptr); + // let locked_box = PromiseSend(locked_box); + // let device_box = PromiseSend(device_box); + + // stream.add_callback(Box::new(move |res| { + // let self_ptr: PromiseSend<_> = self_ptr; + + // std::mem::drop(device_box); + // if res == Ok(()) { + // // Safety: The precondition of this method guarantees that + // // &mut self has been borrowed until after this + // // completion is run + // unsafe { + // std::ptr::copy_nonoverlapping( + // locked_box.0.as_ptr().cast::(), + // self_ptr.0, + // 1, + // ); + // } + // } + // std::mem::drop(locked_box); + 
// })) + // }?; + + // Ok(alloc_tail) } } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 50f2aee1d..485f03c1f 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -124,11 +124,11 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Therefore, `&mut self` should remain mutably borrowed until /// synchronisation has been performed. #[allow(clippy::type_complexity)] - unsafe fn restore_async( + unsafe fn restore_async<'stream, A: CudaAlloc>( &mut self, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult; + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<(Async<'stream, (), &mut Self>, A)>; } /// # Safety @@ -179,7 +179,9 @@ pub trait LendToCuda: RustToCuda { >( &self, inner: F, - ) -> Result; + ) -> Result + where + Self: Sync; /// Moves `self` to CUDA iff `self` is [`StackOnly`]. /// @@ -197,7 +199,7 @@ pub trait LendToCuda: RustToCuda { inner: F, ) -> Result where - Self: RustToCuda; + Self: Send + RustToCuda; } #[cfg(feature = "host")] @@ -211,7 +213,10 @@ impl LendToCuda for T { >( &self, inner: F, - ) -> Result { + ) -> Result + where + Self: Sync, + { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, inner); @@ -233,7 +238,7 @@ impl LendToCuda for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: Send + RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; @@ -264,13 +269,16 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + &Self, >, ) -> Result, >( &self, stream: &'stream rustacuda::stream::Stream, inner: F, - ) -> Result; + ) -> Result + where + Self: Sync; /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// @@ -293,7 +301,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { inner: F, ) -> Result where - Self: RustToCuda; + Self: Send + RustToCuda; } #[cfg(feature = "host")] @@ -306,17 +314,21 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + &Self, >, ) -> Result, >( &self, stream: &'stream rustacuda::stream::Stream, inner: F, - ) -> Result { + ) -> Result + where + Self: Sync, + { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::new(const_ref, stream)?) + inner(Async::new(const_ref, stream, self, |_self| Ok(()))?) }); core::mem::drop(cuda_repr); @@ -341,12 +353,12 @@ impl LendToCudaAsync for T { inner: F, ) -> Result where - Self: RustToCuda, + Self: Send + RustToCuda, { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::new(owned_ref, stream)?) + inner(Async::new(owned_ref, stream, (), |()| Ok(()))?) }); core::mem::drop(alloc); diff --git a/src/utils/async.rs b/src/utils/async.rs index b691b755f..78bad1725 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -6,8 +6,8 @@ use std::{ #[cfg(feature = "host")] use rustacuda::{ - error::CudaError, error::CudaResult, event::Event, event::EventFlags, event::EventStatus, - stream::Stream, stream::StreamWaitEventFlags, + error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::Stream, + stream::StreamWaitEventFlags, }; #[cfg(feature = "host")] @@ -15,14 +15,19 @@ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { +pub trait CudaAsync<'stream, T, C: Send = ()>: Sized + IntoFuture> { /// Wraps a still-asynchronous `value` which is being computed on `stream` /// such that its computation can be synchronised on. 
/// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - fn new(value: T, stream: &'stream Stream) -> CudaResult; + fn new( + value: T, + stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> CudaResult; /// Synchronises on this computation to block until it has completed and /// the inner value can be safely returned and again be used in synchronous @@ -41,7 +46,7 @@ pub trait CudaAsync<'stream, T>: Sized + IntoFuture> { fn move_to_stream<'stream_new>( self, stream: &'stream_new Stream, - ) -> CudaResult>; + ) -> CudaResult>; } #[cfg(feature = "host")] @@ -50,8 +55,15 @@ pub struct Sync { } #[cfg(feature = "host")] -impl<'stream, T> CudaAsync<'stream, T> for Sync { - fn new(value: T, _stream: &'stream Stream) -> CudaResult { +impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Sync { + fn new( + value: T, + _stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> CudaResult { + on_completion(capture)?; + Ok(Self { value }) } @@ -76,58 +88,136 @@ impl IntoFuture for Sync { } #[cfg(feature = "host")] -pub struct Async<'stream, T> { +pub struct Async<'stream, T, C = ()> { _stream: PhantomData<&'stream Stream>, event: CudaDropWrapper, - waker: Arc>>, value: T, + status: Arc>>, +} + +// This could also be expressed as a +// https://docs.rs/oneshot/latest/oneshot/index.html channel +#[cfg(feature = "host")] +enum AsyncStatus { + Processing { waker: Option, capture: C }, + Completed { result: CudaResult<()> }, } +// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { - fn new(value: T, stream: &'stream Stream) -> CudaResult { +impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Async<'stream, T, C> { + fn new( + value: T, + stream: &'stream Stream, + capture: C, + on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + ) -> 
CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); - event.record(stream)?; - let waker: Arc>> = Arc::new(Mutex::new(None)); - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut waker) = waker_callback.lock() { - if let Some(waker) = waker.take() { - waker.wake(); - } + let status = Arc::new(Mutex::new(AsyncStatus::Processing { + waker: None, + capture, + })); + + let status_callback = status.clone(); + stream.add_callback(Box::new(move |res| { + let Ok(mut status) = status_callback.lock() else { + return; + }; + + let old_status = + std::mem::replace(&mut *status, AsyncStatus::Completed { result: Ok(()) }); + + let AsyncStatus::Processing { mut waker, capture } = old_status else { + // this path should never be taken + *status = old_status; + return; + }; + + if let Err(err) = res { + *status = AsyncStatus::Completed { result: Err(err) }; + } else if let Err(err) = on_completion(capture) { + *status = AsyncStatus::Completed { result: Err(err) }; + } + + if let Some(waker) = waker.take() { + waker.wake(); } }))?; + event.record(stream)?; + Ok(Self { _stream: PhantomData::<&'stream Stream>, event, - waker, value, + status, }) } fn synchronize(self) -> CudaResult { + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + if let AsyncStatus::Completed { result } = &*status { + return result.map(|()| self.value); + } + + std::mem::drop(status); + self.event.synchronize()?; - Ok(self.value) + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + match &*status { + AsyncStatus::Completed { result } => result.map(|()| self.value), + AsyncStatus::Processing { .. 
} => Err(CudaError::NotReady), + } } #[allow(refining_impl_trait)] fn move_to_stream<'stream_new>( self, stream: &'stream_new Stream, - ) -> CudaResult> { + ) -> CudaResult> { + let Ok(status) = self.status.lock() else { + return Err(CudaError::OperatingSystemError); + }; + + if let AsyncStatus::Completed { result } = &*status { + #[allow(clippy::let_unit_value)] + let () = (*result)?; + + std::mem::drop(status); + + // the computation has completed, so the result is available on any stream + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + event: self.event, + value: self.value, + status: self.status, + }); + } + + std::mem::drop(status); + stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; self.event.record(stream)?; - let waker_callback = self.waker.clone(); + // add a new waker callback since the waker may have received a spurious + // wake-up from when the computation completed on the original stream + let waker_callback = self.status.clone(); stream.add_callback(Box::new(move |_| { - if let Ok(mut waker) = waker_callback.lock() { - if let Some(waker) = waker.take() { - waker.wake(); + if let Ok(mut status) = waker_callback.lock() { + if let AsyncStatus::Processing { waker, .. 
} = &mut *status { + if let Some(waker) = waker.take() { + waker.wake(); + } } } }))?; @@ -135,14 +225,14 @@ impl<'stream, T> CudaAsync<'stream, T> for Async<'stream, T> { Ok(Async { _stream: PhantomData::<&'stream_new Stream>, event: self.event, - waker: self.waker, value: self.value, + status: self.status, }) } } #[cfg(feature = "host")] -impl<'stream, T> Async<'stream, T> { +impl<'stream, T, C> Async<'stream, T, C> { /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -157,7 +247,7 @@ impl<'stream, T> Async<'stream, T> { } #[cfg(feature = "host")] -impl<'stream, T> IntoFuture for Async<'stream, T> { +impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { type Output = CudaResult; type IntoFuture = impl Future; @@ -165,22 +255,34 @@ impl<'stream, T> IntoFuture for Async<'stream, T> { fn into_future(self) -> Self::IntoFuture { let mut wrapper = Some(self); - std::future::poll_fn(move |cx| match &wrapper { - Some(Async { waker, event, .. }) => match event.query() { - Ok(EventStatus::NotReady) => waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut waker| { - *waker = Some(cx.waker().clone()); - Poll::Pending + std::future::poll_fn(move |cx| { + let poll = match &wrapper { + #[allow(clippy::option_if_let_else)] + Some(Async { + status: status_mutex, + .. + }) => match status_mutex.lock() { + Ok(mut status_guard) => match &mut *status_guard { + AsyncStatus::Completed { result: Ok(()) } => Poll::Ready(Ok(())), + AsyncStatus::Completed { result: Err(err) } => Poll::Ready(Err(*err)), + AsyncStatus::Processing { waker, .. } => { + *waker = Some(cx.waker().clone()); + Poll::Pending + }, }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { + Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + }, + None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + }; + + match poll { + Poll::Ready(Ok(())) => match wrapper.take() { Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), None => Poll::Ready(Err(CudaError::AlreadyAcquired)), }, - Err(err) => Poll::Ready(Err(err)), - }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), + Poll::Ready(Err(err)) => Poll::Ready(Err(err)), + Poll::Pending => Poll::Pending, + } }) } } From 4993daf56d9b948e64ad5bb17c1041489485655e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 31 Dec 2023 13:43:28 +0000 Subject: [PATCH 074/120] Further API experimentation --- Cargo.toml | 5 +- src/lend/impls/box.rs | 75 ++----- src/lend/impls/option.rs | 66 +++++-- src/lend/mod.rs | 19 +- src/utils/adapter.rs | 24 ++- src/utils/aliasing/mod.rs | 8 +- src/utils/async.rs | 399 ++++++++++++++++++-------------------- src/utils/exchange/mod.rs | 6 +- 8 files changed, 308 insertions(+), 294 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bbabb2007..90626aae6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] -host = ["dep:rustacuda", "dep:regex"] +host = ["dep:rustacuda", "dep:regex", "dep:oneshot"] kernel = ["dep:rust-cuda-kernel"] [dependencies] @@ -34,5 +34,8 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } +safer_owning_ref = { version = "0.5" } +oneshot = { version = "0.1", optional = true, features = ["std", "async"] } + rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 552f93e7e..34224eb62 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -21,7 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, - utils::r#async::{Async, CudaAsync}, + utils::r#async::Async, }; #[doc(hidden)] @@ -75,7 +75,7 @@ unsafe impl RustToCuda for Box { } } -unsafe impl 
RustToCudaAsync for Box { +unsafe impl RustToCudaAsync for Box { #[cfg(all(feature = "host", not(doc)))] type CudaAllocationAsync = CombinedCudaAlloc< CudaDropWrapper>>>, @@ -122,80 +122,41 @@ unsafe impl RustToCudaAsync fo } #[cfg(feature = "host")] - unsafe fn restore_async<'stream, A: CudaAlloc>( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, - ) -> CudaResult<(Async<'stream, (), &mut std::boxed::Box>, A)> { + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )> { use rustacuda::memory::AsyncCopyDestination; - struct PromiseSend(T); - #[allow(clippy::non_send_fields_in_send_ty)] - unsafe impl Send for PromiseSend {} - let (alloc_front, alloc_tail) = alloc.split(); let (mut locked_box, device_box) = alloc_front.split(); device_box.async_copy_to(&mut *locked_box, stream)?; - let locked_box = PromiseSend(locked_box); - let device_box = PromiseSend(device_box); + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + CombinedCudaAlloc::new(locked_box, device_box), + move |this, alloc| { + let data: &mut T = &mut *this; + let (locked_box, device_box) = alloc.split(); - let r#async = - as crate::utils::r#async::CudaAsync< - (), - &mut Self, - >>::new((), stream, self, |data: &mut Self| { - // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(device_box); // Safety: equivalent to *data = *locked_box since // LockedBox> doesn't drop T unsafe { - std::ptr::copy_nonoverlapping( - locked_box.0.as_ptr().cast::(), - &mut **data, - 1, - ); + std::ptr::copy_nonoverlapping(locked_box.as_ptr().cast::(), data, 1); } - // TODO: we cannot actually drop here since that would invoke a CUDA function std::mem::drop(locked_box); Ok(()) - })?; - // std::mem::drop(r#async); + }, + )?; Ok((r#async, alloc_tail)) - - // { - // // TODO: express 
this unsafe-rich completion safely - // // by explicitly capturing &mut self until the - // // async restore has completed - // let self_ptr: *mut T = std::ptr::from_mut(self); - - // let self_ptr = PromiseSend(self_ptr); - // let locked_box = PromiseSend(locked_box); - // let device_box = PromiseSend(device_box); - - // stream.add_callback(Box::new(move |res| { - // let self_ptr: PromiseSend<_> = self_ptr; - - // std::mem::drop(device_box); - // if res == Ok(()) { - // // Safety: The precondition of this method guarantees that - // // &mut self has been borrowed until after this - // // completion is run - // unsafe { - // std::ptr::copy_nonoverlapping( - // locked_box.0.as_ptr().cast::(), - // self_ptr.0, - // 1, - // ); - // } - // } - // std::mem::drop(locked_box); - // })) - // }?; - - // Ok(alloc_tail) } } diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index a0f0f8b4a..5a70a24c6 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -12,7 +12,10 @@ use crate::{ }; #[cfg(feature = "host")] -use crate::alloc::{CombinedCudaAlloc, CudaAlloc}; +use crate::{ + alloc::{CombinedCudaAlloc, CudaAlloc}, + utils::r#async::Async, +}; #[doc(hidden)] #[allow(clippy::module_name_repetitions)] @@ -118,18 +121,59 @@ unsafe impl RustToCudaAsync for Option { } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )> { let (alloc_front, alloc_tail) = alloc.split(); - match (self, alloc_front) { - (Some(value), Some(alloc_front)) => { - value.restore_async(CombinedCudaAlloc::new(alloc_front, alloc_tail), stream) - }, - _ => Ok(alloc_tail), + if let (Some(_), Some(alloc_front)) = (&mut *this, 
alloc_front) { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + #[allow(clippy::option_if_let_else)] + let (r#async, alloc_tail) = RustToCudaAsync::restore_async( + this.map_mut(|value| match value { + Some(value) => value, + None => unreachable!(), // TODO + }), + CombinedCudaAlloc::new(alloc_front, alloc_tail), + stream, + )?; + + let (value, capture_on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(value); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some((capture, on_completion)) = capture_on_completion { + let r#async = Async::pending(this, stream, Some(capture), |this, capture| { + let mut value_backup = unsafe { + std::mem::ManuallyDrop::new(std::ptr::read(this).map_mut( + |value| match value { + Some(value) => value, + None => unreachable!(), // TODO + }, + )) + }; + + if let (Some(_), Some(capture)) = (&mut **this, capture) { + on_completion(&mut value_backup, capture)?; + } + + Ok(()) + })?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) } } } @@ -165,7 +209,7 @@ impl RustToCudaProxy } } -impl RustToCudaAsyncProxy> +impl RustToCudaAsyncProxy> for Option> { fn from_ref(val: &Option) -> &Self { diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 485f03c1f..0b442cab5 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,7 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, - utils::r#async::{Async, CudaAsync}, + utils::r#async::Async, }; mod impls; @@ -124,11 +124,14 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Therefore, `&mut self` should remain mutably borrowed until /// synchronisation has been performed. 
#[allow(clippy::type_complexity)] - unsafe fn restore_async<'stream, A: CudaAlloc>( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult<(Async<'stream, (), &mut Self>, A)>; + ) -> rustacuda::error::CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + A, + )>; } /// # Safety @@ -293,6 +296,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, + (), >, ) -> Result, >( @@ -328,7 +332,9 @@ impl LendToCudaAsync for T { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::new(const_ref, stream, self, |_self| Ok(()))?) + inner(Async::pending(const_ref, stream, self, |_ref, _self| { + Ok(()) + })?) }); core::mem::drop(cuda_repr); @@ -345,6 +351,7 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, + (), >, ) -> Result, >( @@ -358,7 +365,7 @@ impl LendToCudaAsync for T { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::new(owned_ref, stream, (), |()| Ok(()))?) + inner(Async::pending(owned_ref, stream, (), |_ref, ()| Ok(()))?) 
}); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 8e27d98df..182c29184 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -146,14 +146,28 @@ unsafe impl RustToCudaAsync } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - _stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + Self::CudaAllocationAsync, + >, + A, + )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - Ok(alloc_tail) + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + NoCudaAlloc, + |_this, NoCudaAlloc| Ok(()), + )?; + + Ok((r#async, alloc_tail)) } } diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index e7753cf92..aa0a42742 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,5 +1,5 @@ -mod r#const; -mod dynamic; +// mod r#const; +// mod dynamic; -pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; -pub use r#const::SplitSliceOverCudaThreadsConstStride; +// pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; +// pub use r#const::SplitSliceOverCudaThreadsConstStride; diff --git a/src/utils/async.rs b/src/utils/async.rs index 78bad1725..683eeb235 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -1,8 +1,5 @@ #[cfg(feature = "host")] -use std::{ - future::Future, future::IntoFuture, future::Ready, marker::PhantomData, sync::Arc, sync::Mutex, - task::Poll, task::Waker, -}; +use std::{future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; #[cfg(feature = "host")] use rustacuda::{ @@ -14,225 +11,159 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] 
-#[allow(clippy::module_name_repetitions)] -pub trait CudaAsync<'stream, T, C: Send = ()>: Sized + IntoFuture> { - /// Wraps a still-asynchronous `value` which is being computed on `stream` - /// such that its computation can be synchronised on. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn new( - value: T, - stream: &'stream Stream, - capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, - ) -> CudaResult; - - /// Synchronises on this computation to block until it has completed and - /// the inner value can be safely returned and again be used in synchronous - /// operations. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn synchronize(self) -> CudaResult; - - /// Moves the asynchronous data move to a different [`Stream`]. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA. - fn move_to_stream<'stream_new>( - self, - stream: &'stream_new Stream, - ) -> CudaResult>; -} - -#[cfg(feature = "host")] -pub struct Sync { +pub struct Async<'stream, T, C> { + _stream: PhantomData<&'stream Stream>, value: T, + status: AsyncStatus, } #[cfg(feature = "host")] -impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Sync { - fn new( - value: T, - _stream: &'stream Stream, +enum AsyncStatus { + #[allow(clippy::type_complexity)] + Processing { + receiver: oneshot::Receiver>, capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, - ) -> CudaResult { - on_completion(capture)?; - - Ok(Self { value }) - } - - fn synchronize(self) -> CudaResult { - Ok(self.value) - } - - #[allow(refining_impl_trait)] - fn move_to_stream(self, _stream: &Stream) -> CudaResult { - Ok(self) - } + on_completion: Box CudaResult<()>>, + event: CudaDropWrapper, + }, + Completed { + result: CudaResult<()>, + }, } +// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] 
-impl IntoFuture for Sync { - type IntoFuture = Ready>; - type Output = CudaResult; +impl<'stream, T, C> Async<'stream, T, C> { + /// Wraps a `value` which is ready on `stream`. + #[must_use] + pub const fn ready(value: T, stream: &'stream Stream) -> Self { + let _ = stream; - fn into_future(self) -> Self::IntoFuture { - std::future::ready(Ok(self.value)) + Self { + _stream: PhantomData::<&'stream Stream>, + value, + status: AsyncStatus::Completed { result: Ok(()) }, + } } -} -#[cfg(feature = "host")] -pub struct Async<'stream, T, C = ()> { - _stream: PhantomData<&'stream Stream>, - event: CudaDropWrapper, - value: T, - status: Arc>>, -} - -// This could also be expressed as a -// https://docs.rs/oneshot/latest/oneshot/index.html channel -#[cfg(feature = "host")] -enum AsyncStatus { - Processing { waker: Option, capture: C }, - Completed { result: CudaResult<()> }, -} - -// TODO: completion is NOT allowed to make any cuda calls -#[cfg(feature = "host")] -impl<'stream, T, C: Send> CudaAsync<'stream, T, C> for Async<'stream, T, C> { - fn new( + /// Wraps a still-pending `value` which is being computed on `stream` + /// such that its computation can be synchronised on. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
+ pub fn pending( value: T, stream: &'stream Stream, capture: C, - on_completion: impl Send + FnOnce(C) -> CudaResult<()>, + on_completion: impl FnOnce(&mut T, C) -> CudaResult<()> + 'static, ) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); - let status = Arc::new(Mutex::new(AsyncStatus::Processing { - waker: None, - capture, - })); - - let status_callback = status.clone(); - stream.add_callback(Box::new(move |res| { - let Ok(mut status) = status_callback.lock() else { - return; - }; - - let old_status = - std::mem::replace(&mut *status, AsyncStatus::Completed { result: Ok(()) }); - - let AsyncStatus::Processing { mut waker, capture } = old_status else { - // this path should never be taken - *status = old_status; - return; - }; - - if let Err(err) = res { - *status = AsyncStatus::Completed { result: Err(err) }; - } else if let Err(err) = on_completion(capture) { - *status = AsyncStatus::Completed { result: Err(err) }; - } - - if let Some(waker) = waker.take() { - waker.wake(); - } - }))?; + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; event.record(stream)?; Ok(Self { _stream: PhantomData::<&'stream Stream>, - event, value, - status, + status: AsyncStatus::Processing { + capture, + receiver, + on_completion: Box::new(on_completion), + event, + }, }) } - fn synchronize(self) -> CudaResult { - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); + /// Synchronises on this computation to block until it has completed and + /// the inner value can be safely returned and again be used in synchronous + /// operations. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. 
+ pub fn synchronize(mut self) -> CudaResult { + let (receiver, capture, on_completion) = match self.status { + AsyncStatus::Completed { result } => return result.map(|()| self.value), + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event: _, + } => (receiver, capture, on_completion), }; - if let AsyncStatus::Completed { result } = &*status { - return result.map(|()| self.value); + match receiver.recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - std::mem::drop(status); - - self.event.synchronize()?; - - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); - }; + on_completion(&mut self.value, capture)?; - match &*status { - AsyncStatus::Completed { result } => result.map(|()| self.value), - AsyncStatus::Processing { .. } => Err(CudaError::NotReady), - } + Ok(self.value) } - #[allow(refining_impl_trait)] - fn move_to_stream<'stream_new>( - self, + /// Moves the asynchronous data move to a different [`Stream`]. + /// + /// # Errors + /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside + /// CUDA. + pub fn move_to_stream<'stream_new>( + mut self, stream: &'stream_new Stream, ) -> CudaResult> { - let Ok(status) = self.status.lock() else { - return Err(CudaError::OperatingSystemError); + let (receiver, capture, on_completion, event) = match self.status { + AsyncStatus::Completed { .. 
} => { + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + value: self.value, + status: self.status, + }) + }, + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + } => (receiver, capture, on_completion, event), }; - if let AsyncStatus::Completed { result } = &*status { - #[allow(clippy::let_unit_value)] - let () = (*result)?; - - std::mem::drop(status); - - // the computation has completed, so the result is available on any stream - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - event: self.event, - value: self.value, - status: self.status, - }); - } - - std::mem::drop(status); - - stream.wait_event(&self.event, StreamWaitEventFlags::DEFAULT)?; - self.event.record(stream)?; + match receiver.try_recv() { + Ok(Ok(())) => (), + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => { + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + return Ok(Async { + _stream: PhantomData::<&'stream_new Stream>, + value: self.value, + status: AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + }, + }); + }, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), + }; - // add a new waker callback since the waker may have received a spurious - // wake-up from when the computation completed on the original stream - let waker_callback = self.status.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut status) = waker_callback.lock() { - if let AsyncStatus::Processing { waker, .. 
} = &mut *status { - if let Some(waker) = waker.take() { - waker.wake(); - } - } - } - }))?; + on_completion(&mut self.value, capture)?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - event: self.event, value: self.value, - status: self.status, + status: AsyncStatus::Completed { result: Ok(()) }, }) } -} -#[cfg(feature = "host")] -impl<'stream, T, C> Async<'stream, T, C> { + #[allow(clippy::missing_errors_doc)] // FIXME + #[allow(clippy::type_complexity)] // FIXME /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -241,8 +172,69 @@ impl<'stream, T, C> Async<'stream, T, C> { /// This method must only be used to construct a larger asynchronous /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. - pub unsafe fn unwrap_unchecked(self) -> T { - self.value + pub unsafe fn unwrap_unchecked( + self, + ) -> CudaResult<(T, Option<(C, Box CudaResult<()>>)>)> { + match self.status { + AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), + AsyncStatus::Completed { result: Err(err) } => Err(err), + AsyncStatus::Processing { + receiver: _, + capture, + on_completion, + event: _, + } => Ok((self.value, Some((capture, on_completion)))), + } + } +} + +#[cfg(feature = "host")] +struct AsyncFuture<'stream, T, C> { + _stream: PhantomData<&'stream Stream>, + value: Option, + #[allow(clippy::type_complexity)] + capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, + status: AsyncStatus, +} + +#[cfg(feature = "host")] +impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { + type Output = CudaResult; + + fn poll( + self: core::pin::Pin<&mut Self>, + cx: &mut core::task::Context<'_>, + ) -> Poll { + // Safety: this function does not move out of `this` + let this = unsafe { self.get_unchecked_mut() }; + + match &mut this.status { + AsyncStatus::Processing { + receiver, + capture: (), + on_completion: _, + event: _, + } => match std::pin::Pin::new(receiver).poll(cx) 
{ + Poll::Ready(Ok(Ok(()))) => (), + Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), + Poll::Ready(Err(oneshot::RecvError)) => { + return Poll::Ready(Err(CudaError::AlreadyAcquired)) + }, + Poll::Pending => return Poll::Pending, + }, + AsyncStatus::Completed { result: Ok(()) } => (), + AsyncStatus::Completed { result: Err(err) } => return Poll::Ready(Err(*err)), + } + + let Some(mut value) = this.value.take() else { + return Poll::Ready(Err(CudaError::AlreadyAcquired)); + }; + + if let Some((capture, on_completion)) = this.capture_on_completion.take() { + on_completion(&mut value, capture)?; + } + + Poll::Ready(Ok(value)) } } @@ -253,36 +245,29 @@ impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - std::future::poll_fn(move |cx| { - let poll = match &wrapper { - #[allow(clippy::option_if_let_else)] - Some(Async { - status: status_mutex, - .. - }) => match status_mutex.lock() { - Ok(mut status_guard) => match &mut *status_guard { - AsyncStatus::Completed { result: Ok(()) } => Poll::Ready(Ok(())), - AsyncStatus::Completed { result: Err(err) } => Poll::Ready(Err(*err)), - AsyncStatus::Processing { waker, .. } => { - *waker = Some(cx.waker().clone()); - Poll::Pending - }, - }, - Err(_) => Poll::Ready(Err(CudaError::OperatingSystemError)), + let (capture_on_completion, status) = match self.status { + AsyncStatus::Completed { result } => (None, AsyncStatus::Completed { result }), + AsyncStatus::Processing { + receiver, + capture, + on_completion, + event, + } => ( + Some((capture, on_completion)), + AsyncStatus::Processing { + receiver, + capture: (), + on_completion: Box::new(|_self, ()| Ok(())), + event, }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }; + ), + }; - match poll { - Poll::Ready(Ok(())) => match wrapper.take() { - Some(Async { value, .. 
}) => Poll::Ready(Ok(value)), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Poll::Ready(Err(err)) => Poll::Ready(Err(err)), - Poll::Pending => Poll::Pending, - } - }) + AsyncFuture { + _stream: PhantomData::<&'stream Stream>, + value: Some(self.value), + capture_on_completion, + status, + } } } diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index 722e02559..9c0de5e36 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,4 +1,4 @@ -pub mod buffer; +// pub mod buffer; -#[cfg(feature = "host")] -pub mod wrapper; +// #[cfg(feature = "host")] +// pub mod wrapper; From f8618c5abfe6323356592cd5356108760016b4d9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 1 Jan 2024 13:43:18 +0000 Subject: [PATCH 075/120] Further async API experimentation --- src/host/mod.rs | 5 ++ src/lend/impls/box.rs | 23 +++++--- src/lend/impls/boxed_slice.rs | 106 +++++++++++++++++++++++++++++++++- src/lend/impls/option.rs | 86 ++++++++++++++++----------- src/lend/mod.rs | 44 ++++++++++---- src/utils/adapter.rs | 23 ++++---- src/utils/async.rs | 37 ++++++------ 7 files changed, 243 insertions(+), 81 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index f77c75792..6dee3b1f6 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -336,6 +336,11 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { self.host_val } + #[must_use] + pub(crate) fn for_async_completion(&mut self) -> &mut T { + self.host_val + } + #[must_use] pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 34224eb62..1dffba723 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -83,14 +83,16 @@ unsafe impl RustToCudaAsync for Box; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = 
Self::CudaAllocationAsync; #[cfg(feature = "host")] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -114,9 +116,14 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -141,7 +148,7 @@ unsafe impl RustToCudaAsync for Box RustToCuda for Box<[T]> { } } +unsafe impl RustToCudaAsync for Box<[T]> { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = Self::CudaAllocationAsync; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'stream, DeviceAccessible, &Self>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, 
stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(BoxedSliceCudaRepresentation { + data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::, + }), + stream, + self, + |_cuda_repr, _self| Ok(()), + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + A, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let (alloc_front, alloc_tail) = alloc.split(); + let (mut locked_buffer, device_buffer) = alloc_front.split(); + + device_buffer.async_copy_to(&mut *locked_buffer, stream)?; + + let r#async = crate::utils::r#async::Async::pending( + this, + stream, + CombinedCudaAlloc::new(locked_buffer, device_buffer), + move |this: &mut Self, alloc| { + let data: &mut [T] = &mut *this; + let (locked_buffer, device_buffer) = alloc.split(); + + std::mem::drop(device_buffer); + // Safety: equivalent to data.copy_from_slice(&*locked_buffer) + // since LockedBox> doesn't drop T + unsafe { + std::ptr::copy_nonoverlapping( + locked_buffer.as_ptr().cast::(), + data.as_mut_ptr(), + data.len(), + ); + } + std::mem::drop(locked_buffer); + Ok(()) + }, + )?; + + Ok((r#async, alloc_tail)) + } +} + unsafe impl CudaAsRust for BoxedSliceCudaRepresentation { diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 5a70a24c6..b447c6e34 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -83,41 +83,65 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { type CudaAllocationAsync = Option<::CudaAllocationAsync>; + #[cfg(feature = "host")] + type RestoreAsyncCapture = ( + ::RestoreAsyncCapture, + Box< + dyn 
FnOnce(&mut T, ::RestoreAsyncCapture) -> CudaResult<()> + + 'static, + >, + ); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { None => ( - OptionCudaRepresentation { - maybe: MaybeUninit::uninit(), - present: false, - }, + Async::ready( + DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::uninit(), + present: false, + }), + stream, + ), CombinedCudaAlloc::new(None, alloc), ), Some(value) => { let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; - let (alloc_front, alloc_tail) = alloc.split(); + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; - ( - OptionCudaRepresentation { - maybe: MaybeUninit::new(cuda_repr), - present: true, - }, - CombinedCudaAlloc::new(Some(alloc_front), alloc_tail), - ) + let (alloc_front, alloc_tail) = alloc.split(); + let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); + + let option_cuda_repr = DeviceAccessible::from(OptionCudaRepresentation { + maybe: MaybeUninit::new(cuda_repr), + present: true, + }); + + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(option_cuda_repr, stream, self, |option_cuda_repr, this| { + // if let Some(capture) = this { + // on_completion(todo!(), capture)?; + // } + Ok(()) + })? 
+ } else { + Async::ready(option_cuda_repr, stream) + }; + + (r#async, alloc) }, }; - Ok((DeviceAccessible::from(cuda_repr), alloc)) + Ok((cuda_repr, alloc)) } #[cfg(feature = "host")] @@ -126,7 +150,7 @@ unsafe impl RustToCudaAsync for Option { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )> { let (alloc_front, alloc_tail) = alloc.split(); @@ -150,22 +174,18 @@ unsafe impl RustToCudaAsync for Option { let this = std::mem::ManuallyDrop::into_inner(this_backup); if let Some((capture, on_completion)) = capture_on_completion { - let r#async = Async::pending(this, stream, Some(capture), |this, capture| { - let mut value_backup = unsafe { - std::mem::ManuallyDrop::new(std::ptr::read(this).map_mut( - |value| match value { - Some(value) => value, - None => unreachable!(), // TODO - }, - )) - }; - - if let (Some(_), Some(capture)) = (&mut **this, capture) { - on_completion(&mut value_backup, capture)?; - } - - Ok(()) - })?; + let r#async = Async::pending( + this, + stream, + (capture, on_completion), + |this: &mut Self, (capture, on_completion)| { + if let Some(value) = this { + on_completion(value, capture)?; + } + + Ok(()) + }, + )?; Ok((r#async, alloc_tail)) } else { let r#async = Async::ready(this, stream); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 0b442cab5..df533a050 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -75,6 +75,10 @@ pub unsafe trait RustToCuda { pub unsafe trait RustToCudaAsync: RustToCuda { type CudaAllocationAsync: CudaAlloc; + #[doc(hidden)] + #[cfg(feature = "host")] + type RestoreAsyncCapture; + #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -98,12 +102,12 @@ pub unsafe trait RustToCudaAsync: RustToCuda { /// Similarly, `&self` should remain borrowed until synchronisation has /// been performed. 
#[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )>; @@ -129,7 +133,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::CudaAllocationAsync>, + Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, A, )>; } @@ -296,7 +300,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, - (), + Self, >, ) -> Result, >( @@ -331,10 +335,20 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - inner(Async::pending(const_ref, stream, self, |_ref, _self| { - Ok(()) - })?) + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(const_ref, stream, self, |const_ref, this| { + // TODO + // on_completion(const_ref.for_host(), this) + Ok(()) + })? + } else { + Async::ready(const_ref, stream) + }; + + inner(r#async) }); core::mem::drop(cuda_repr); @@ -351,7 +365,7 @@ impl LendToCudaAsync for T { Async< 'stream, HostAndDeviceOwned::CudaRepresentation>>, - (), + Self, >, ) -> Result, >( @@ -364,8 +378,18 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; + let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - inner(Async::pending(owned_ref, stream, (), |_ref, ()| Ok(()))?) + let r#async = if let Some((capture, on_completion)) = capture_on_completion { + Async::pending(owned_ref, stream, self, |owned_ref, this| { + on_completion(owned_ref.for_async_completion(), &this) + })? + } else { + Async::ready(owned_ref, stream) + }; + + inner(r#async) }); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 182c29184..e2f667be8 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -130,19 +130,24 @@ unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { type CudaAllocationAsync = NoCudaAlloc; + #[cfg(feature = "host")] + type RestoreAsyncCapture = (); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - _stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'stream, DeviceAccessible, &Self>, CombinedCudaAlloc, )> { let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); - Ok((DeviceAccessible::from(*self), alloc)) + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(*self), stream), + alloc, + )) } #[cfg(feature = "host")] @@ -154,18 +159,14 @@ unsafe impl RustToCudaAsync crate::utils::r#async::Async< 'stream, owning_ref::BoxRefMut<'a, O, Self>, - Self::CudaAllocationAsync, + Self::RestoreAsyncCapture, + Self, >, A, )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - let r#async = crate::utils::r#async::Async::pending( - this, - stream, - NoCudaAlloc, - |_this, NoCudaAlloc| Ok(()), - )?; + let r#async = crate::utils::r#async::Async::pending(this, stream, (), |_this, ()| Ok(()))?; Ok((r#async, alloc_tail)) } diff --git a/src/utils/async.rs b/src/utils/async.rs index 683eeb235..561cf97f8 100644 --- 
a/src/utils/async.rs +++ b/src/utils/async.rs @@ -1,5 +1,5 @@ #[cfg(feature = "host")] -use std::{future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; +use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::PhantomData, task::Poll}; #[cfg(feature = "host")] use rustacuda::{ @@ -11,19 +11,19 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] -pub struct Async<'stream, T, C> { +pub struct Async<'stream, T: BorrowMut, C, B: ?Sized = T> { _stream: PhantomData<&'stream Stream>, value: T, - status: AsyncStatus, + status: AsyncStatus, } #[cfg(feature = "host")] -enum AsyncStatus { +enum AsyncStatus { #[allow(clippy::type_complexity)] Processing { receiver: oneshot::Receiver>, capture: C, - on_completion: Box CudaResult<()>>, + on_completion: Box CudaResult<()>>, event: CudaDropWrapper, }, Completed { @@ -33,7 +33,7 @@ enum AsyncStatus { // TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T, C> Async<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// Wraps a `value` which is ready on `stream`. 
#[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { @@ -56,7 +56,7 @@ impl<'stream, T, C> Async<'stream, T, C> { value: T, stream: &'stream Stream, capture: C, - on_completion: impl FnOnce(&mut T, C) -> CudaResult<()> + 'static, + on_completion: impl FnOnce(&mut B, C) -> CudaResult<()> + 'static, ) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, @@ -103,7 +103,7 @@ impl<'stream, T, C> Async<'stream, T, C> { Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - on_completion(&mut self.value, capture)?; + on_completion(self.value.borrow_mut(), capture)?; Ok(self.value) } @@ -116,7 +116,7 @@ impl<'stream, T, C> Async<'stream, T, C> { pub fn move_to_stream<'stream_new>( mut self, stream: &'stream_new Stream, - ) -> CudaResult> { + ) -> CudaResult> { let (receiver, capture, on_completion, event) = match self.status { AsyncStatus::Completed { .. } => { return Ok(Async { @@ -153,7 +153,7 @@ impl<'stream, T, C> Async<'stream, T, C> { Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - on_completion(&mut self.value, capture)?; + on_completion(self.value.borrow_mut(), capture)?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, @@ -174,7 +174,10 @@ impl<'stream, T, C> Async<'stream, T, C> { /// same [`Stream`]. 
pub unsafe fn unwrap_unchecked( self, - ) -> CudaResult<(T, Option<(C, Box CudaResult<()>>)>)> { + ) -> CudaResult<( + T, + Option<(C, Box CudaResult<()> + 'static>)>, + )> { match self.status { AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), @@ -189,16 +192,16 @@ impl<'stream, T, C> Async<'stream, T, C> { } #[cfg(feature = "host")] -struct AsyncFuture<'stream, T, C> { +struct AsyncFuture<'stream, T: BorrowMut, C, B: ?Sized> { _stream: PhantomData<&'stream Stream>, value: Option, #[allow(clippy::type_complexity)] - capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, - status: AsyncStatus, + capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, + status: AsyncStatus<(), B>, } #[cfg(feature = "host")] -impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, C, B> { type Output = CudaResult; fn poll( @@ -231,7 +234,7 @@ impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { }; if let Some((capture, on_completion)) = this.capture_on_completion.take() { - on_completion(&mut value, capture)?; + on_completion(value.borrow_mut(), capture)?; } Poll::Ready(Ok(value)) @@ -239,7 +242,7 @@ impl<'stream, T, C> Future for AsyncFuture<'stream, T, C> { } #[cfg(feature = "host")] -impl<'stream, T, C> IntoFuture for Async<'stream, T, C> { +impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, B> { type Output = CudaResult; type IntoFuture = impl Future; From 5f52d1d3a3f0c125a7a4b08079d1d4174ffd994d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 2 Jan 2024 05:10:04 +0000 Subject: [PATCH 076/120] Further async API design work --- src/host/mod.rs | 5 -- src/lend/impls/box.rs | 20 ++--- src/lend/impls/boxed_slice.rs | 20 ++--- src/lend/impls/option.rs | 36 +++----- src/lend/mod.rs | 42 ++++------ src/utils/adapter.rs | 13 +-- src/utils/async.rs | 152 
+++++++++++++++++++++------------- 7 files changed, 144 insertions(+), 144 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 6dee3b1f6..f77c75792 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -336,11 +336,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { self.host_val } - #[must_use] - pub(crate) fn for_async_completion(&mut self) -> &mut T { - self.host_val - } - #[must_use] pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { HostAndDeviceOwnedAsync { diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 1dffba723..121fe3905 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -22,6 +22,8 @@ use crate::{ host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, utils::r#async::Async, + utils::r#async::CompletionFnMut, + utils::r#async::NoCompletion, }; #[doc(hidden)] @@ -83,8 +85,6 @@ unsafe impl RustToCudaAsync for Box; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = Self::CudaAllocationAsync; #[cfg(feature = "host")] unsafe fn borrow_async<'stream, A: CudaAlloc>( @@ -92,7 +92,7 @@ unsafe impl RustToCudaAsync for Box rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -121,8 +121,7 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -144,14 +143,11 @@ unsafe impl RustToCudaAsync for Box>::pending( this, stream, - CombinedCudaAlloc::new(locked_box, device_box), - move |this: &mut Self, alloc| { + Box::new(move 
|this: &mut Self| { let data: &mut T = &mut *this; - let (locked_box, device_box) = alloc.split(); - std::mem::drop(device_box); // Safety: equivalent to *data = *locked_box since // LockedBox> doesn't drop T @@ -160,7 +156,7 @@ unsafe impl RustToCudaAsync for Box RustToCudaAsync for Box<[ >; #[cfg(any(not(feature = "host"), doc))] type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = Self::CudaAllocationAsync; #[cfg(feature = "host")] unsafe fn borrow_async<'stream, A: CudaAlloc>( @@ -100,7 +98,7 @@ unsafe impl RustToCudaAsync for Box<[ alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { use rustacuda::memory::AsyncCopyDestination; @@ -132,8 +130,7 @@ unsafe impl RustToCudaAsync for Box<[ _marker: PhantomData::, }), stream, - self, - |_cuda_repr, _self| Ok(()), + NoCompletion, )?, CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), )) @@ -145,7 +142,7 @@ unsafe impl RustToCudaAsync for Box<[ alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { use rustacuda::memory::AsyncCopyDestination; @@ -155,14 +152,11 @@ unsafe impl RustToCudaAsync for Box<[ device_buffer.async_copy_to(&mut *locked_buffer, stream)?; - let r#async = crate::utils::r#async::Async::pending( + let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending( this, stream, - CombinedCudaAlloc::new(locked_buffer, device_buffer), - move |this: &mut Self, alloc| { + Box::new(move |this: &mut Self| { let data: &mut [T] = &mut *this; - let (locked_buffer, device_buffer) = alloc.split(); - std::mem::drop(device_buffer); // Safety: equivalent 
to data.copy_from_slice(&*locked_buffer) // since LockedBox> doesn't drop T @@ -175,7 +169,7 @@ unsafe impl RustToCudaAsync for Box<[ } std::mem::drop(locked_buffer); Ok(()) - }, + }), )?; Ok((r#async, alloc_tail)) diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index b447c6e34..c05c0d3bb 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -14,7 +14,7 @@ use crate::{ #[cfg(feature = "host")] use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, - utils::r#async::Async, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -83,14 +83,6 @@ unsafe impl RustToCuda for Option { unsafe impl RustToCudaAsync for Option { type CudaAllocationAsync = Option<::CudaAllocationAsync>; - #[cfg(feature = "host")] - type RestoreAsyncCapture = ( - ::RestoreAsyncCapture, - Box< - dyn FnOnce(&mut T, ::RestoreAsyncCapture) -> CudaResult<()> - + 'static, - >, - ); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -99,7 +91,7 @@ unsafe impl RustToCudaAsync for Option { alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { let (cuda_repr, alloc) = match self { @@ -126,13 +118,8 @@ unsafe impl RustToCudaAsync for Option { present: true, }); - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(option_cuda_repr, stream, self, |option_cuda_repr, this| { - // if let Some(capture) = this { - // on_completion(todo!(), capture)?; - // } - Ok(()) - })? + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(option_cuda_repr, stream, NoCompletion)? 
} else { Async::ready(option_cuda_repr, stream) }; @@ -150,7 +137,7 @@ unsafe impl RustToCudaAsync for Option { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )> { let (alloc_front, alloc_tail) = alloc.split(); @@ -168,23 +155,22 @@ unsafe impl RustToCudaAsync for Option { stream, )?; - let (value, capture_on_completion) = unsafe { r#async.unwrap_unchecked()? }; + let (value, on_completion) = unsafe { r#async.unwrap_unchecked()? }; std::mem::forget(value); let this = std::mem::ManuallyDrop::into_inner(this_backup); - if let Some((capture, on_completion)) = capture_on_completion { - let r#async = Async::pending( + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( this, stream, - (capture, on_completion), - |this: &mut Self, (capture, on_completion)| { + Box::new(|this: &mut Self| { if let Some(value) = this { - on_completion(value, capture)?; + on_completion(value)?; } Ok(()) - }, + }), )?; Ok((r#async, alloc_tail)) } else { diff --git a/src/lend/mod.rs b/src/lend/mod.rs index df533a050..598a586b8 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -17,7 +17,7 @@ use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, host::{HostAndDeviceConstRef, HostAndDeviceOwned}, - utils::r#async::Async, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; mod impls; @@ -75,10 +75,6 @@ pub unsafe trait RustToCuda { pub unsafe trait RustToCudaAsync: RustToCuda { type CudaAllocationAsync: CudaAlloc; - #[doc(hidden)] - #[cfg(feature = "host")] - type RestoreAsyncCapture; - #[doc(hidden)] #[cfg(feature = "host")] /// # Errors @@ -107,7 +103,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: A, stream: &'stream 
rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, DeviceAccessible, &Self>, + Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )>; @@ -133,7 +129,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { alloc: CombinedCudaAlloc, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - Async<'stream, owning_ref::BoxRefMut<'a, O, Self>, Self::RestoreAsyncCapture, Self>, + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, )>; } @@ -274,9 +270,9 @@ pub trait LendToCudaAsync: RustToCudaAsync { E: From, F: FnOnce( Async< + '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, - &Self, >, ) -> Result, >( @@ -296,11 +292,11 @@ pub trait LendToCudaAsync: RustToCudaAsync { 'stream, O, E: From, - F: FnOnce( + F: for<'a> FnOnce( Async< + 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, - Self, >, ) -> Result, >( @@ -320,9 +316,9 @@ impl LendToCudaAsync for T { E: From, F: FnOnce( Async< + '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, - &Self, >, ) -> Result, >( @@ -338,12 +334,8 @@ impl LendToCudaAsync for T { let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(const_ref, stream, self, |const_ref, this| { - // TODO - // on_completion(const_ref.for_host(), this) - Ok(()) - })? + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(const_ref, stream, NoCompletion)? } else { Async::ready(const_ref, stream) }; @@ -361,11 +353,11 @@ impl LendToCudaAsync for T { 'stream, O, E: From, - F: FnOnce( + F: for<'a> FnOnce( Async< + 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, - Self, >, ) -> Result, >( @@ -381,15 +373,11 @@ impl LendToCudaAsync for T { let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - let r#async = if let Some((capture, on_completion)) = capture_on_completion { - Async::pending(owned_ref, stream, self, |owned_ref, this| { - on_completion(owned_ref.for_async_completion(), &this) - })? + if matches!(capture_on_completion, Some(NoCompletion)) { + inner(Async::pending(owned_ref, stream, NoCompletion)?) } else { - Async::ready(owned_ref, stream) - }; - - inner(r#async) + inner(Async::ready(owned_ref, stream)) + } }); core::mem::drop(alloc); diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index e2f667be8..093a02fd4 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -130,8 +130,6 @@ unsafe impl RustToCudaAsync for RustToCudaWithPortableBitCopySemantics { type CudaAllocationAsync = NoCudaAlloc; - #[cfg(feature = "host")] - type RestoreAsyncCapture = (); #[cfg(feature = "host")] #[allow(clippy::type_complexity)] @@ -140,7 +138,7 @@ unsafe impl RustToCudaAsync alloc: A, stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - crate::utils::r#async::Async<'stream, DeviceAccessible, &Self>, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, )> { let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); @@ -157,16 +155,19 @@ unsafe impl RustToCudaAsync stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< + 'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, - Self::RestoreAsyncCapture, - Self, + crate::utils::r#async::CompletionFnMut<'a, Self>, >, A, )> { let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); - let r#async = crate::utils::r#async::Async::pending(this, stream, (), |_this, ()| Ok(()))?; + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; Ok((r#async, alloc_tail)) } diff --git a/src/utils/async.rs b/src/utils/async.rs index 
561cf97f8..6aab8adca 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -11,29 +11,68 @@ use rustacuda::{ use crate::host::CudaDropWrapper; #[cfg(feature = "host")] -pub struct Async<'stream, T: BorrowMut, C, B: ?Sized = T> { +pub struct NoCompletion; +#[cfg(feature = "host")] +pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; + +#[cfg(feature = "host")] +pub trait Completion>: sealed::Sealed { + type Completed: ?Sized; + + #[allow(clippy::missing_errors_doc)] // FIXME + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; +} +#[cfg(feature = "host")] +mod sealed { + pub trait Sealed {} +} + +#[cfg(feature = "host")] +impl Completion for NoCompletion { + type Completed = T; + + fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { + Ok(()) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for NoCompletion {} + +#[cfg(feature = "host")] +impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { + type Completed = B; + + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + (self)(completed) + } +} +#[cfg(feature = "host")] +impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} + +#[cfg(feature = "host")] +pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { _stream: PhantomData<&'stream Stream>, value: T, - status: AsyncStatus, + status: AsyncStatus<'a, T, C>, + _capture: PhantomData<&'a ()>, } #[cfg(feature = "host")] -enum AsyncStatus { +enum AsyncStatus<'a, T: BorrowMut, C: Completion> { #[allow(clippy::type_complexity)] Processing { receiver: oneshot::Receiver>, - capture: C, - on_completion: Box CudaResult<()>>, + completion: C, event: CudaDropWrapper, + _capture: PhantomData<&'a T>, }, Completed { result: CudaResult<()>, }, } -// TODO: completion is NOT allowed to make any cuda calls #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 
'stream, T, C> { /// Wraps a `value` which is ready on `stream`. #[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { @@ -43,6 +82,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { _stream: PhantomData::<&'stream Stream>, value, status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, } } @@ -52,12 +92,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn pending( - value: T, - stream: &'stream Stream, - capture: C, - on_completion: impl FnOnce(&mut B, C) -> CudaResult<()> + 'static, - ) -> CudaResult { + pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { let event = CudaDropWrapper::from(Event::new( EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, )?); @@ -71,11 +106,12 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { _stream: PhantomData::<&'stream Stream>, value, status: AsyncStatus::Processing { - capture, receiver, - on_completion: Box::new(on_completion), + completion, event, + _capture: PhantomData::<&'a T>, }, + _capture: PhantomData::<&'a ()>, }) } @@ -87,14 +123,14 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. 
pub fn synchronize(mut self) -> CudaResult { - let (receiver, capture, on_completion) = match self.status { + let (receiver, completion) = match self.status { AsyncStatus::Completed { result } => return result.map(|()| self.value), AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event: _, - } => (receiver, capture, on_completion), + _capture, + } => (receiver, completion), }; match receiver.recv() { @@ -103,7 +139,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - on_completion(self.value.borrow_mut(), capture)?; + completion.complete(self.value.borrow_mut())?; Ok(self.value) } @@ -116,21 +152,22 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { pub fn move_to_stream<'stream_new>( mut self, stream: &'stream_new Stream, - ) -> CudaResult> { - let (receiver, capture, on_completion, event) = match self.status { + ) -> CudaResult> { + let (receiver, completion, event) = match self.status { AsyncStatus::Completed { .. 
} => { return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, value: self.value, status: self.status, + _capture: PhantomData::<&'a ()>, }) }, AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, - } => (receiver, capture, on_completion, event), + _capture, + } => (receiver, completion, event), }; match receiver.try_recv() { @@ -144,21 +181,23 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { value: self.value, status: AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, + _capture: PhantomData::<&'a T>, }, + _capture: PhantomData::<&'a ()>, }); }, Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - on_completion(self.value.borrow_mut(), capture)?; + completion.complete(self.value.borrow_mut())?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, value: self.value, status: AsyncStatus::Completed { result: Ok(()) }, + _capture: PhantomData::<&'a ()>, }) } @@ -172,36 +211,32 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Async<'stream, T, C, B> { /// This method must only be used to construct a larger asynchronous /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. 
- pub unsafe fn unwrap_unchecked( - self, - ) -> CudaResult<( - T, - Option<(C, Box CudaResult<()> + 'static>)>, - )> { + pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { match self.status { AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), AsyncStatus::Processing { receiver: _, - capture, - on_completion, + completion, event: _, - } => Ok((self.value, Some((capture, on_completion)))), + _capture, + } => Ok((self.value, Some(completion))), } } } #[cfg(feature = "host")] -struct AsyncFuture<'stream, T: BorrowMut, C, B: ?Sized> { +struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { _stream: PhantomData<&'stream Stream>, value: Option, - #[allow(clippy::type_complexity)] - capture_on_completion: Option<(C, Box CudaResult<()> + 'static>)>, - status: AsyncStatus<(), B>, + completion: Option, + status: AsyncStatus<'a, T, NoCompletion>, } #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> Future + for AsyncFuture<'a, 'stream, T, C> +{ type Output = CudaResult; fn poll( @@ -214,9 +249,9 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, match &mut this.status { AsyncStatus::Processing { receiver, - capture: (), - on_completion: _, + completion: _, event: _, + _capture, } => match std::pin::Pin::new(receiver).poll(cx) { Poll::Ready(Ok(Ok(()))) => (), Poll::Ready(Ok(Err(err))) => return Poll::Ready(Err(err)), @@ -233,8 +268,8 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, return Poll::Ready(Err(CudaError::AlreadyAcquired)); }; - if let Some((capture, on_completion)) = this.capture_on_completion.take() { - on_completion(value.borrow_mut(), capture)?; + if let Some(completion) = this.completion.take() { + completion.complete(value.borrow_mut())?; } Poll::Ready(Ok(value)) @@ -242,26 +277,31 @@ impl<'stream, 
T: BorrowMut, C, B: ?Sized> Future for AsyncFuture<'stream, T, } #[cfg(feature = "host")] -impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, B> { +impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture + for Async<'a, 'stream, T, C> +{ type Output = CudaResult; type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (capture_on_completion, status) = match self.status { - AsyncStatus::Completed { result } => (None, AsyncStatus::Completed { result }), + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match self.status + { + AsyncStatus::Completed { result } => { + (None, AsyncStatus::Completed:: { result }) + }, AsyncStatus::Processing { receiver, - capture, - on_completion, + completion, event, + _capture, } => ( - Some((capture, on_completion)), - AsyncStatus::Processing { + Some(completion), + AsyncStatus::Processing:: { receiver, - capture: (), - on_completion: Box::new(|_self, ()| Ok(())), + completion: NoCompletion, event, + _capture: PhantomData::<&'a T>, }, ), }; @@ -269,7 +309,7 @@ impl<'stream, T: BorrowMut, C, B: ?Sized> IntoFuture for Async<'stream, T, C, AsyncFuture { _stream: PhantomData::<&'stream Stream>, value: Some(self.value), - capture_on_completion, + completion, status, } } From 9dc2ae7c30f1ad42941a033a3b8868fe96ebbf8b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 2 Jan 2024 05:53:07 +0000 Subject: [PATCH 077/120] Add RustToCudaAsync impls for &T and &[T], but not &mut T or &mut [T] --- .github/workflows/ci.yml | 4 +- Cargo.toml | 4 +- src/lend/impls/ref.rs | 73 +++++++++++++++++++++++++++++++- src/lend/impls/ref_mut.rs | 3 ++ src/lend/impls/slice_ref.rs | 75 ++++++++++++++++++++++++++++++++- src/lend/impls/slice_ref_mut.rs | 3 ++ 6 files changed, 154 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a8f37a6dd..fcf0fd63c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,7 +63,7 @@ 
jobs: - name: Check feature powerset on CUDA run: | cargo hack check --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive,regex \ + --skip host \ --keep-going \ --target nvptx64-nvidia-cuda @@ -182,7 +182,7 @@ jobs: - name: Check feature powerset on CUDA run: | cargo hack clippy --feature-powerset --optional-deps \ - --skip host,rustacuda,rustacuda_derive,regex \ + --skip host \ --keep-going \ --target nvptx64-nvidia-cuda \ -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index 90626aae6..eb0e1725f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] -host = ["dep:rustacuda", "dep:regex", "dep:oneshot"] +host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"] kernel = ["dep:rust-cuda-kernel"] [dependencies] @@ -34,7 +34,7 @@ regex = { version = "1.10", optional = true } const-type-layout = { version = "0.2.1", features = ["derive"] } -safer_owning_ref = { version = "0.5" } +safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } rust-cuda-derive = { path = "rust-cuda-derive", optional = true } diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index c068920ab..501393f63 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -1,12 +1,14 @@ use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBox}; +use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, utils::ffi::DeviceConstPointer, }; @@ -19,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, 
utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -69,6 +72,72 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T } } +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a T { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_box = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + std::ptr::copy_nonoverlapping( + std::ptr::from_ref::(&**self) + .cast::>>(), + uninit.as_mut_ptr(), + 1, + ); + uninit + }; + + let mut device_box = CudaDropWrapper::from(DeviceBox::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized()?); + device_box.async_copy_from(&*locked_box, stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(RefCudaRepresentation { + data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()), + _marker: PhantomData::<&T>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_box, device_box), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = 
Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefCudaRepresentation<'a, T> { diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs index 2a59d8953..cab1ea8df 100644 --- a/src/lend/impls/ref_mut.rs +++ b/src/lend/impls/ref_mut.rs @@ -76,6 +76,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu } } +// &mut T cannot implement RustToCudaAsync since the reference, potentially +// with garbage data, would remain accessible after failing a mutable restore + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for RefMutCudaRepresentation<'a, T> { diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 70d3a1e63..4f8a3ecd9 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -1,12 +1,14 @@ use core::marker::PhantomData; +#[cfg(feature = "host")] +use std::mem::ManuallyDrop; use const_type_layout::{TypeGraphLayout, TypeLayout}; #[cfg(feature = "host")] -use rustacuda::{error::CudaResult, memory::DeviceBuffer}; +use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer}; use crate::{ - lend::{CudaAsRust, RustToCuda}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, safety::PortableBitSemantics, utils::ffi::DeviceConstPointer, }; @@ -19,6 +21,7 @@ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, host::CudaDropWrapper, utils::adapter::DeviceCopyWithPortableBitSemantics, + utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; #[doc(hidden)] @@ -72,6 +75,74 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T } } +unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] { + #[cfg(all(feature = "host", not(doc)))] + type CudaAllocationAsync = CombinedCudaAlloc< + CudaDropWrapper>>>, + CudaDropWrapper>>>, + >; + #[cfg(any(not(feature = "host"), doc))] + type CudaAllocationAsync = 
crate::alloc::SomeCudaAlloc; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + use rustacuda::memory::AsyncCopyDestination; + + let locked_buffer = unsafe { + let mut uninit = CudaDropWrapper::from(LockedBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + std::ptr::copy_nonoverlapping( + self.as_ref() + .as_ptr() + .cast::>>(), + uninit.as_mut_ptr(), + self.len(), + ); + uninit + }; + + let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< + DeviceCopyWithPortableBitSemantics>, + >::uninitialized(self.len())?); + device_buffer.async_copy_from(&*locked_buffer, stream)?; + + Ok(( + Async::pending( + DeviceAccessible::from(SliceRefCudaRepresentation { + data: DeviceConstPointer(device_buffer.as_ptr().cast()), + len: device_buffer.len(), + _marker: PhantomData::<&'a [T]>, + }), + stream, + NoCompletion, + )?, + CombinedCudaAlloc::new(CombinedCudaAlloc::new(locked_buffer, device_buffer), alloc), + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'b, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> CudaResult<( + Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefCudaRepresentation<'a, T> { diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs index 0e802ccca..5c766dd24 100644 --- a/src/lend/impls/slice_ref_mut.rs +++ b/src/lend/impls/slice_ref_mut.rs @@ -79,6 +79,9 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for 
&'a mu } } +// &mut [T] cannot implement RustToCudaAsync since the slice, potentially with +// garbage data, would remain accessible after failing a mutable restore + unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> CudaAsRust for SliceRefMutCudaRepresentation<'a, T> { From 91f9246832390215bc91ebf00b6692839918a13c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 04:02:35 +0000 Subject: [PATCH 078/120] Add back mostly unchanged exchange wrapper + buffer with RustToCudaAsync impls --- src/host/mod.rs | 6 - src/utils/async.rs | 26 ++ src/utils/exchange/buffer/host.rs | 51 ++- src/utils/exchange/buffer/mod.rs | 47 ++- src/utils/exchange/mod.rs | 6 +- src/utils/exchange/wrapper.rs | 536 +++++++++--------------------- 6 files changed, 262 insertions(+), 410 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index f77c75792..a705c8504 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -362,10 +362,7 @@ impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> pub unsafe fn new( device_box: &'a mut DeviceBox>, host_ref: &'a mut T, - stream: &'stream Stream, ) -> Self { - let _ = stream; - Self { device_box, host_ref, @@ -448,10 +445,7 @@ impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> pub const unsafe fn new( device_box: &'a DeviceBox>, host_ref: &'a T, - stream: &'stream Stream, ) -> Self { - let _ = stream; - Self { device_box, host_ref, diff --git a/src/utils/async.rs b/src/utils/async.rs index 6aab8adca..f408431ae 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -223,6 +223,32 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } => Ok((self.value, Some(completion))), } } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. 
+ /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub const unsafe fn unwrap_ref_unchecked(&self) -> &T { + &self.value + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + &mut self.value + } } #[cfg(feature = "host")] diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index e62227d8e..ce0cb9d41 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -16,6 +16,7 @@ use crate::{ utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::{DeviceAccessible, DeviceMutPointer}, + r#async::{Async, CompletionFnMut, NoCompletion}, }, }; @@ -174,12 +175,12 @@ impl { #[allow(clippy::type_complexity)] - pub unsafe fn borrow_async( + pub unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible>, + Async<'_, 'stream, DeviceAccessible>>, CombinedCudaAlloc, )> { // Safety: device_buffer is inside an UnsafeCell @@ -196,33 +197,49 @@ impl( - &mut self, + pub unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { let (_alloc_front, alloc_tail) = alloc.split(); 
if M2H { // Only move the buffer contents back to the host if needed + let this: &mut Self = &mut this; + rustacuda::memory::AsyncCopyDestination::async_copy_to( - &***self.device_buffer.get_mut(), - self.host_buffer.as_mut_slice(), + &***this.device_buffer.get_mut(), + this.host_buffer.as_mut_slice(), stream, )?; } - Ok(alloc_tail) + let r#async = if M2H { + Async::<_, CompletionFnMut<'a, Self>>::pending(this, stream, Box::new(|_this| Ok(())))? + } else { + Async::ready(this, stream) + }; + + Ok((r#async, alloc_tail)) } } diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 9dfc4414e..c48a715ac 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -20,6 +20,7 @@ use crate::{ use crate::{ alloc::{CombinedCudaAlloc, CudaAlloc}, utils::ffi::DeviceAccessible, + utils::r#async::{Async, CompletionFnMut}, }; #[cfg(any(feature = "host", feature = "device"))] @@ -133,25 +134,51 @@ unsafe impl( + unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, - CombinedCudaAlloc, + Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, )> { self.inner.borrow_async(alloc, stream) } #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn restore_async( - &mut self, - alloc: CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.inner.restore_async(alloc, stream) + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = 
host::CudaExchangeBufferHost::restore_async( + this.map_mut(|this| &mut this.inner), + alloc, + stream, + )?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = Async::<_, CompletionFnMut<'a, Self>>::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/exchange/mod.rs b/src/utils/exchange/mod.rs index 9c0de5e36..722e02559 100644 --- a/src/utils/exchange/mod.rs +++ b/src/utils/exchange/mod.rs @@ -1,4 +1,4 @@ -// pub mod buffer; +pub mod buffer; -// #[cfg(feature = "host")] -// pub mod wrapper; +#[cfg(feature = "host")] +pub mod wrapper; diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 454ecc8f3..09aef582d 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -1,16 +1,9 @@ -use std::{ - future::{Future, IntoFuture}, - marker::PhantomData, - ops::{Deref, DerefMut}, - sync::{Arc, Mutex}, - task::{Poll, Waker}, -}; +use std::ops::{Deref, DerefMut}; use rustacuda::{ - error::{CudaError, CudaResult}, - event::{Event, EventFlags, EventStatus}, + error::CudaResult, memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, - stream::{Stream, StreamWaitEventFlags}, + stream::Stream, }; use crate::{ @@ -20,32 +13,16 @@ use crate::{ HostAndDeviceMutRefAsync, }, lend::{RustToCuda, RustToCudaAsync}, - utils::{adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible}, + utils::{ + adapter::DeviceCopyWithPortableBitSemantics, + ffi::DeviceAccessible, + r#async::{Async, CompletionFnMut, NoCompletion}, + }, }; #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnHost> { - value: T, - device_box: CudaDropWrapper< - DeviceBox< - 
DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - locked_cuda_repr: CudaDropWrapper< - LockedBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - move_event: CudaDropWrapper, -} - -#[allow(clippy::module_name_repetitions)] -pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda> { - value: T, + value: Box, device_box: CudaDropWrapper< DeviceBox< DeviceCopyWithPortableBitSemantics< @@ -60,34 +37,11 @@ pub struct ExchangeWrapperOnHostAsync<'stream, T: RustToCuda, >, >, - move_event: CudaDropWrapper, - stream: PhantomData<&'stream Stream>, - waker: Arc>>, } #[allow(clippy::module_name_repetitions)] pub struct ExchangeWrapperOnDevice> { - value: T, - device_box: CudaDropWrapper< - DeviceBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - locked_cuda_repr: CudaDropWrapper< - LockedBox< - DeviceCopyWithPortableBitSemantics< - DeviceAccessible<::CudaRepresentation>, - >, - >, - >, - move_event: CudaDropWrapper, -} - -#[allow(clippy::module_name_repetitions)] -pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda> { - value: T, + value: Box, device_box: CudaDropWrapper< DeviceBox< DeviceCopyWithPortableBitSemantics< @@ -102,9 +56,6 @@ pub struct ExchangeWrapperOnDeviceAsync<'stream, T: RustToCuda, >, >, - move_event: CudaDropWrapper, - stream: &'stream Stream, - waker: Arc>>, } impl> ExchangeWrapperOnHost { @@ -130,16 +81,14 @@ impl> ExchangeWrapperOnHost { uninit }; - let move_event = Event::new(EventFlags::DISABLE_TIMING)?.into(); - Ok(Self { - value, + value: Box::new(value), device_box, locked_cuda_repr, - move_event, }) } + // TODO: safety constraint? /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably /// via [`ExchangeWrapperOnDevice::as_mut`]. 
@@ -164,7 +113,6 @@ impl> ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, }) } } @@ -172,6 +120,8 @@ impl> ExchangeWrapperOnHost { impl> ExchangeWrapperOnHost { + #[allow(clippy::needless_lifetimes)] // keep 'stream explicit + // TODO: safety constraint? /// Moves the data asynchronously to the CUDA device. /// /// To avoid aliasing, each CUDA thread will get access to its own shallow @@ -182,11 +132,14 @@ impl( mut self, - stream: &Stream, - ) -> CudaResult> { - let (cuda_repr, null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; + stream: &'stream Stream, + ) -> CudaResult, NoCompletion>> { + let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; + let (cuda_repr, _completion): (_, Option) = + unsafe { cuda_repr.unwrap_unchecked()? }; + **self.locked_cuda_repr = DeviceCopyWithPortableBitSemantics::from(cuda_repr); // Safety: The device value is not safely exposed until either @@ -196,112 +149,16 @@ impl>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - let _: NoCudaAlloc = null_alloc.into(); - - Ok(ExchangeWrapperOnDeviceAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream, - waker, - }) - } -} - -impl<'stream, T: RustToCuda> - ExchangeWrapperOnHostAsync<'stream, T> -{ - /// Synchronises the host CPU thread until the data has moved to the CPU. 
- /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn sync_to_host(self) -> CudaResult> { - self.move_event.synchronize()?; - - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) - } - - /// Moves the asynchronous data move to a different [`Stream`]. - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_stream(self, stream: &Stream) -> CudaResult> { - stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; - self.move_event.record(stream)?; - - let waker_callback = self.waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&Stream>, - waker: self.waker, - }) - } -} -impl<'stream, T: RustToCuda> IntoFuture - for ExchangeWrapperOnHostAsync<'stream, T> -{ - type Output = CudaResult>; - - type IntoFuture = impl Future; - - fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - core::future::poll_fn(move |cx| match &wrapper { - Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut w| { - *w = Some(cx.waker().clone()); - Poll::Pending - }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { - Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnHost { - value: inner.value, - device_box: inner.device_box, - locked_cuda_repr: inner.locked_cuda_repr, - move_event: inner.move_event, - })), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Err(err) => Poll::Ready(Err(err)), + 
Async::pending( + ExchangeWrapperOnDevice { + value: self.value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }) + stream, + NoCompletion, + ) } } @@ -319,83 +176,60 @@ impl> DerefMut for ExchangeWrapper } } -impl<'stream, T: RustToCuda> - ExchangeWrapperOnDeviceAsync<'stream, T> -{ - /// Synchronises the host CPU thread until the data has moved to the GPU. +impl> ExchangeWrapperOnDevice { + // TODO: safety constraint? + /// Moves the data synchronously back to the host CPU device. + /// + /// To avoid aliasing, each CUDA thread only got access to its own shallow + /// copy of the data. Hence, + /// - any shallow changes to the data will NOT be reflected back to the CPU + /// - any deep changes to the data WILL be reflected back to the CPU /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn sync_to_device(self) -> CudaResult> { - self.move_event.synchronize()?; + pub fn move_to_host(mut self) -> CudaResult> { + let null_alloc = NoCudaAlloc.into(); - Ok(ExchangeWrapperOnDevice { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) - } + // Reflect deep changes back to the CPU + let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; - /// Moves the asynchronous data move to a different [`Stream`]. 
- /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_stream( - self, - stream: &Stream, - ) -> CudaResult> { - stream.wait_event(&self.move_event, StreamWaitEventFlags::DEFAULT)?; - self.move_event.record(stream)?; - - let waker_callback = self.waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnDeviceAsync { + // Note: Shallow changes are not reflected back to the CPU + + Ok(ExchangeWrapperOnHost { value: self.value, device_box: self.device_box, locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream, - waker: self.waker, }) } - pub fn as_ref_async( + #[must_use] + pub fn as_ref( &self, - ) -> HostAndDeviceConstRefAsync::CudaRepresentation>> { + ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRefAsync::new( - &*self.device_box, - (**self.locked_cuda_repr).into_ref(), - self.stream, - ) + HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) } } - pub fn as_mut_async( + #[must_use] + pub fn as_mut( &mut self, - ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRefAsync::new( - &mut self.device_box, - (**self.locked_cuda_repr).into_mut(), - self.stream, - ) + HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) } } +} - /// Moves the data synchronously back to the host CPU device. +impl> + ExchangeWrapperOnDevice +{ + #[allow(clippy::needless_lifetimes)] // keep 'stream explicit + // TODO: safety constraint? + /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow /// copy of the data. Hence, @@ -405,28 +239,60 @@ impl<'stream, T: RustToCuda> /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA - pub fn move_to_host(mut self) -> CudaResult> { + pub fn move_to_host_async<'stream>( + self, + stream: &'stream Stream, + ) -> CudaResult< + Async< + 'static, + 'stream, + ExchangeWrapperOnHost, + CompletionFnMut<'static, ExchangeWrapperOnHost>, + >, + > { let null_alloc = NoCudaAlloc.into(); + let value = owning_ref::BoxRefMut::new(self.value); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; + let (r#async, _null_alloc): (_, NoCudaAlloc) = + unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?; + let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? }; + + let value = value.into_owner(); // Note: Shallow changes are not reflected back to the CPU - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) + if let Some(on_complete) = on_complete { + Async::<_, CompletionFnMut>>::pending( + ExchangeWrapperOnHost { + value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + }, + stream, + Box::new(|on_host: &mut ExchangeWrapperOnHost| on_complete(&mut on_host.value)), + ) + } else { + Ok(Async::ready( + ExchangeWrapperOnHost { + value, + device_box: self.device_box, + locked_cuda_repr: self.locked_cuda_repr, + }, + stream, + )) + } } } impl< + 'a, 'stream, T: RustToCudaAsync, - > ExchangeWrapperOnDeviceAsync<'stream, T> + > Async<'a, 'stream, ExchangeWrapperOnDevice, NoCompletion> { + // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. 
/// /// To avoid aliasing, each CUDA thread only got access to its own shallow @@ -438,165 +304,87 @@ impl< /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA pub fn move_to_host_async( - mut self, + self, stream: &'stream Stream, - ) -> CudaResult> { - let null_alloc = NoCudaAlloc.into(); - - // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; - - // Note: Shallow changes are not reflected back to the CPU - - self.move_event.record(stream)?; - - let waker: Arc>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&'stream Stream>, - waker, - }) - } -} - -impl<'stream, T: RustToCuda> IntoFuture - for ExchangeWrapperOnDeviceAsync<'stream, T> -{ - type Output = CudaResult>; - - type IntoFuture = impl Future; - - fn into_future(self) -> Self::IntoFuture { - let mut wrapper = Some(self); - - core::future::poll_fn(move |cx| match &wrapper { - Some(inner) => match inner.move_event.query() { - Ok(EventStatus::NotReady) => inner.waker.lock().map_or_else( - |_| Poll::Ready(Err(CudaError::OperatingSystemError)), - |mut w| { - *w = Some(cx.waker().clone()); - Poll::Pending - }, - ), - Ok(EventStatus::Ready) => match wrapper.take() { - Some(inner) => Poll::Ready(Ok(ExchangeWrapperOnDevice { - value: inner.value, - device_box: inner.device_box, - locked_cuda_repr: inner.locked_cuda_repr, - move_event: inner.move_event, - })), - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }, - Err(err) => Poll::Ready(Err(err)), - }, - None => Poll::Ready(Err(CudaError::AlreadyAcquired)), - }) - } -} + ) -> CudaResult< + Async< + 
'static, + 'stream, + ExchangeWrapperOnHost, + CompletionFnMut<'static, ExchangeWrapperOnHost>, + >, + > { + let (this, completion): (_, Option) = unsafe { self.unwrap_unchecked()? }; -impl> ExchangeWrapperOnDevice { - /// Moves the data synchronously back to the host CPU device. - /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_host(mut self) -> CudaResult> { let null_alloc = NoCudaAlloc.into(); + let value = owning_ref::BoxRefMut::new(this.value); + // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore(null_alloc) }?; + let (r#async, _null_alloc): (_, NoCudaAlloc) = + unsafe { RustToCudaAsync::restore_async(value, null_alloc, stream) }?; + let (value, on_complete) = unsafe { r#async.unwrap_unchecked()? 
}; + + let value = value.into_owner(); // Note: Shallow changes are not reflected back to the CPU - Ok(ExchangeWrapperOnHost { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - }) + let on_host = ExchangeWrapperOnHost { + value, + device_box: this.device_box, + locked_cuda_repr: this.locked_cuda_repr, + }; + + if let Some(on_complete) = on_complete { + Async::<_, CompletionFnMut>>::pending( + on_host, + stream, + Box::new(|on_host: &mut ExchangeWrapperOnHost| on_complete(&mut on_host.value)), + ) + } else if matches!(completion, Some(NoCompletion)) { + Async::<_, CompletionFnMut>>::pending( + on_host, + stream, + Box::new(|_on_host: &mut ExchangeWrapperOnHost| Ok(())), + ) + } else { + Ok(Async::ready(on_host, stream)) + } } - pub fn as_ref( + // TODO: replace by async borrow map + #[must_use] + pub fn as_ref_async( &self, - ) -> HostAndDeviceConstRef::CudaRepresentation>> { + ) -> HostAndDeviceConstRefAsync< + 'stream, + '_, + DeviceAccessible<::CudaRepresentation>, + > { + let this = unsafe { self.unwrap_ref_unchecked() }; + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + HostAndDeviceConstRefAsync::new( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ) } } - pub fn as_mut( + // TODO: replace by async borrow map mut + #[must_use] + pub fn as_mut_async( &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> { + ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + let this = unsafe { self.unwrap_mut_unchecked() }; + // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRef::new(&mut self.device_box, (**self.locked_cuda_repr).into_mut()) + HostAndDeviceMutRefAsync::new( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ) } } } - -impl> - ExchangeWrapperOnDevice -{ - 
/// Moves the data asynchronously back to the host CPU device. - /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// - /// # Errors - /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside - /// CUDA - pub fn move_to_host_async( - mut self, - stream: &Stream, - ) -> CudaResult> { - let null_alloc = NoCudaAlloc.into(); - - // Reflect deep changes back to the CPU - let _null_alloc: NoCudaAlloc = unsafe { self.value.restore_async(null_alloc, stream) }?; - - // Note: Shallow changes are not reflected back to the CPU - - self.move_event.record(stream)?; - - let waker: Arc>> = Arc::new(Mutex::new(None)); - - let waker_callback = waker.clone(); - stream.add_callback(Box::new(move |_| { - if let Ok(mut w) = waker_callback.lock() { - if let Some(w) = w.take() { - w.wake(); - } - } - }))?; - - Ok(ExchangeWrapperOnHostAsync { - value: self.value, - device_box: self.device_box, - locked_cuda_repr: self.locked_cuda_repr, - move_event: self.move_event, - stream: PhantomData::<&Stream>, - waker, - }) - } -} From 7e2801f06862c35041f978b27931d71bc574f9c4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 04:14:13 +0000 Subject: [PATCH 079/120] Add back mostly unchanged anti-aliasing types with RustToCudaAsync impls --- src/utils/aliasing/const.rs | 70 +++++++++++++++++++++++++++------- src/utils/aliasing/dynamic.rs | 71 +++++++++++++++++++++++++++-------- src/utils/aliasing/mod.rs | 8 ++-- 3 files changed, 116 insertions(+), 33 deletions(-) diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 8441a5bd1..0259c301a 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -219,29 +219,71 @@ unsafe impl RustToCudaAsync #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn 
borrow_async( + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, )> { - let (cuda_repr, alloc) = self.0.borrow_async(alloc, stream)?; - - Ok(( - DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)), - alloc, - )) + let (r#async, alloc) = self.0.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; + + let cuda_repr = + DeviceAccessible::from(SplitSliceOverCudaThreadsConstStride::new(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? + } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.0.restore_async(alloc, stream) + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.0), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? 
}; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.0)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index f8a04fa06..1c502dc8e 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -197,32 +197,73 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn #[cfg(feature = "host")] #[allow(clippy::type_complexity)] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, ) -> rustacuda::error::CudaResult<( - DeviceAccessible, + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, )> { - let (cuda_repr, alloc) = self.inner.borrow_async(alloc, stream)?; + let (r#async, alloc) = self.inner.borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { r#async.unwrap_unchecked()? }; - Ok(( - DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + let cuda_repr = DeviceAccessible::from(SplitSliceOverCudaThreadsDynamicStride::new( + cuda_repr, + self.stride, + )); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( cuda_repr, - self.stride, - )), - alloc, - )) + stream, + crate::utils::r#async::NoCompletion, + )? 
+ } else { + crate::utils::r#async::Async::ready(cuda_repr, stream) + }; + + Ok((r#async, alloc)) } #[cfg(feature = "host")] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &rustacuda::stream::Stream, - ) -> rustacuda::error::CudaResult { - self.inner.restore_async(alloc, stream) + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let this_backup = unsafe { std::mem::ManuallyDrop::new(std::ptr::read(&this)) }; + + let (r#async, alloc_tail) = + T::restore_async(this.map_mut(|this| &mut this.inner), alloc, stream)?; + + let (inner, on_completion) = unsafe { r#async.unwrap_unchecked()? }; + + std::mem::forget(inner); + let this = std::mem::ManuallyDrop::into_inner(this_backup); + + if let Some(on_completion) = on_completion { + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending( + this, + stream, + Box::new(|this: &mut Self| on_completion(&mut this.inner)), + )?; + Ok((r#async, alloc_tail)) + } else { + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } } } diff --git a/src/utils/aliasing/mod.rs b/src/utils/aliasing/mod.rs index aa0a42742..e7753cf92 100644 --- a/src/utils/aliasing/mod.rs +++ b/src/utils/aliasing/mod.rs @@ -1,5 +1,5 @@ -// mod r#const; -// mod dynamic; +mod r#const; +mod dynamic; -// pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; -// pub use r#const::SplitSliceOverCudaThreadsConstStride; +pub use dynamic::SplitSliceOverCudaThreadsDynamicStride; +pub use r#const::SplitSliceOverCudaThreadsConstStride; From af999e5b4620d929a1562d12d0ca7c5b9c73e0c1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree 
Date: Wed, 3 Jan 2024 11:14:07 +0000 Subject: [PATCH 080/120] Progress on replacing ...Async with Async<...> --- src/host/mod.rs | 213 +++++----------------------------- src/kernel/mod.rs | 6 +- src/kernel/param.rs | 121 ++++++++++++------- src/utils/exchange/wrapper.rs | 47 ++++---- 4 files changed, 130 insertions(+), 257 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index a705c8504..62870dd39 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -22,6 +22,7 @@ use crate::{ DeviceConstPointer, DeviceConstRef, DeviceMutPointer, DeviceMutRef, DeviceOwnedPointer, DeviceOwnedRef, }, + r#async::{Async, NoCompletion}, }, }; @@ -190,15 +191,20 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_async<'stream, 'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> + pub fn as_async<'b, 'stream>( + &'b mut self, + stream: &'stream Stream, + ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, { - HostAndDeviceMutRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: PhantomData::<&'stream Stream>, - } + Async::ready( + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) } } @@ -284,15 +290,20 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub const fn as_async<'stream, 'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> + pub const fn as_async<'b, 'stream>( + &'b self, + stream: &'stream Stream, + ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, NoCompletion> where 'a: 'b, { - HostAndDeviceConstRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: PhantomData::<&'stream Stream>, - } + Async::ready( + HostAndDeviceConstRef { + device_box: self.device_box, + host_ref: self.host_ref, + }, + stream, + ) } } @@ -337,178 +348,10 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } 
#[must_use] - pub fn into_async<'stream>(self) -> HostAndDeviceOwnedAsync<'stream, 'a, T> { - HostAndDeviceOwnedAsync { - device_box: self.device_box, - host_val: self.host_val, - stream: PhantomData::<&'stream Stream>, - } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceMutRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - stream: PhantomData<&'stream Stream>, -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceMutRefAsync<'stream, 'a, T> -{ - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - ) -> Self { - Self { - device_box, - host_ref, - stream: PhantomData::<&'stream Stream>, - } - } - - #[must_use] - /// # Safety - /// - /// The returned [`DeviceMutRef`] must only be used on the constructed-with - /// [`Stream`] - pub unsafe fn for_device_async<'b>(&'b mut self) -> DeviceMutRef<'a, T> - where - 'a: 'b, - { - DeviceMutRef { - pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), - reference: PhantomData, - } - } - - #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { - self.host_ref - } - - #[must_use] - pub fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - HostAndDeviceConstRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: self.stream, - } - } - - #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - HostAndDeviceMutRefAsync { - device_box: self.device_box, - host_ref: self.host_ref, - stream: self.stream, - } - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceConstRefAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a DeviceBox>, - host_ref: &'a T, - stream: PhantomData<&'stream Stream>, -} - 
-impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Clone - for HostAndDeviceConstRefAsync<'stream, 'a, T> -{ - fn clone(&self) -> Self { - *self - } -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> Copy - for HostAndDeviceConstRefAsync<'stream, 'a, T> -{ -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceConstRefAsync<'stream, 'a, T> -{ - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - #[must_use] - pub const unsafe fn new( - device_box: &'a DeviceBox>, - host_ref: &'a T, - ) -> Self { - Self { - device_box, - host_ref, - stream: PhantomData::<&'stream Stream>, - } - } - - #[must_use] - /// # Safety - /// - /// The returned [`DeviceConstRef`] must only be used on the - /// constructed-with [`Stream`] - pub unsafe fn for_device_async<'b>(&'b self) -> DeviceConstRef<'a, T> - where - 'a: 'b, - { - let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) }); - - DeviceConstRef { - pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()), - reference: PhantomData, - } - } - - #[must_use] - pub const fn for_host(&'a self) -> &'a T { - self.host_ref - } - - #[must_use] - pub const fn as_ref<'b>(&'b self) -> HostAndDeviceConstRefAsync<'stream, 'b, T> - where - 'a: 'b, - { - *self - } -} - -#[allow(clippy::module_name_repetitions)] -pub struct HostAndDeviceOwnedAsync<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> { - device_box: &'a mut DeviceBox>, - host_val: &'a mut T, - stream: PhantomData<&'stream Stream>, -} - -impl<'stream, 'a, T: PortableBitSemantics + TypeGraphLayout> - HostAndDeviceOwnedAsync<'stream, 'a, T> -{ - #[must_use] - /// # Safety - /// - /// The returned [`DeviceOwnedRef`] must only be used on the - /// constructed-with [`Stream`] - pub unsafe fn for_device_async(self) -> DeviceOwnedRef<'a, T> { - DeviceOwnedRef { - pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), - marker: 
PhantomData::, - reference: PhantomData::<&'a mut ()>, - } - } - - #[must_use] - pub fn for_host(&self) -> &T { - self.host_val + pub const fn into_async<'stream>( + self, + stream: &'stream Stream, + ) -> Async<'a, 'stream, Self, NoCompletion> { + Async::ready(self, stream) } } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index b6ed5b8e7..40985a0e8 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -67,9 +67,9 @@ pub trait CudaKernelParameter: sealed::Sealed { #[doc(hidden)] #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b>; + ) -> Result, E>; #[doc(hidden)] #[cfg(feature = "device")] @@ -377,7 +377,7 @@ macro_rules! impl_typed_kernel_launch { shared_memory_size, &[ $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg) + &mut $T::async_to_ffi($arg)? ).cast::()),* ], ) } diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 17d4bc3a5..ad4ade594 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -102,10 +102,10 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - param + ) -> Result, E> { + Ok(param) } #[cfg(feature = "device")] @@ -138,7 +138,12 @@ impl< > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceConstRef<'b, T>, + crate::utils::r#async::NoCompletion, + >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; @@ -148,10 +153,12 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: 
&'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| inner(const_ref.as_async())) + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { + inner(const_ref.as_async(stream)) + }) } #[cfg(feature = "host")] @@ -168,10 +175,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -228,6 +237,7 @@ impl< param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -237,9 +247,9 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } @@ -303,7 +313,12 @@ impl< > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync<'stream, 'b, T>; + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceConstRef<'b, T>, + crate::utils::r#async::NoCompletion, + >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; @@ -315,11 +330,11 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl 
for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async()) + inner(const_ref.as_ref().as_async(stream)) }) } @@ -337,10 +352,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -414,10 +431,14 @@ impl< > CudaKernelParameter for SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceOwnedAsync< - 'stream, + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< 'b, - DeviceAccessible<::CudaRepresentation>, + 'stream, + crate::host::HostAndDeviceOwned< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T; @@ -429,10 +450,10 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async())) + crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async(stream))) } #[cfg(feature = "host")] @@ -449,10 +470,12 @@ impl< } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? 
}; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -478,10 +501,14 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::host::HostAndDeviceConstRefAsync< - 'stream, + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< 'b, - DeviceAccessible<::CudaRepresentation>, + 'stream, + crate::host::HostAndDeviceConstRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -493,10 +520,10 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream))) } #[cfg(feature = "host")] @@ -513,10 +540,12 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - unsafe { param.for_device_async() } + ) -> Result, E> { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? 
}; + Ok(param.for_device()) } #[cfg(feature = "device")] @@ -565,13 +594,14 @@ impl< param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { as CudaKernelParameter>::async_to_ffi(param) } @@ -634,6 +664,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: &Self::AsyncHostType<'_, '_>, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -643,9 +674,9 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { + ) -> Result, E> { <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) } @@ -738,13 +769,13 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedFfi { + ) -> Result, E> { + Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], _marker: PhantomData::, - } + }) } #[cfg(feature = "device")] @@ -795,13 +826,13 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete } #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b>( + fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, - ) -> Self::FfiType<'stream, 'b> { - private_shared::ThreadBlockSharedSliceFfi { + ) -> Result, E> { + 
Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), _marker: [], - } + }) } #[cfg(feature = "device")] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 09aef582d..aeee541e1 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -8,10 +8,7 @@ use rustacuda::{ use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, - host::{ - CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceConstRefAsync, HostAndDeviceMutRef, - HostAndDeviceMutRefAsync, - }, + host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef}, lend::{RustToCuda, RustToCudaAsync}, utils::{ adapter::DeviceCopyWithPortableBitSemantics, @@ -356,35 +353,37 @@ impl< #[must_use] pub fn as_ref_async( &self, - ) -> HostAndDeviceConstRefAsync< - 'stream, - '_, - DeviceAccessible<::CudaRepresentation>, - > { + ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> + { let this = unsafe { self.unwrap_ref_unchecked() }; - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceConstRefAsync::new( - &*(this.device_box), - (**(this.locked_cuda_repr)).into_ref(), - ) - } + todo!() + + // Safety: `device_box` contains exactly the device copy of + // `locked_cuda_repr` unsafe { + // HostAndDeviceConstRefAsync::new( + // &*(this.device_box), + // (**(this.locked_cuda_repr)).into_ref(), + // ) + // } } // TODO: replace by async borrow map mut #[must_use] pub fn as_mut_async( &mut self, - ) -> HostAndDeviceMutRefAsync::CudaRepresentation>> { + ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> + { let this = unsafe { self.unwrap_mut_unchecked() }; - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceMutRefAsync::new( - &mut *(this.device_box), - (**(this.locked_cuda_repr)).into_mut(), - ) - } + todo!() + + // Safety: `device_box` contains exactly the device copy of + // `locked_cuda_repr` unsafe { + 
// HostAndDeviceMutRefAsync::new( + // &mut *(this.device_box), + // (**(this.locked_cuda_repr)).into_mut(), + // ) + // } } } From 24efa2301379b96b71d91e0016d08787dcf31980 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 13:29:53 +0000 Subject: [PATCH 081/120] Seal more implementation details --- src/host/mod.rs | 14 +++++---- src/kernel/mod.rs | 28 +++++++++++------ src/kernel/param.rs | 76 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 89 insertions(+), 29 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 62870dd39..9efc7b9b1 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -152,8 +152,9 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { result } + #[allow(dead_code)] // FIXME #[must_use] - pub fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> + pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> where 'a: 'b, { @@ -163,8 +164,9 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } + #[allow(dead_code)] // FIXME #[must_use] - pub fn for_host<'b: 'a>(&'b self) -> &'a T { + pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { self.host_ref } @@ -264,7 +266,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> + pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> where 'a: 'b, { @@ -277,7 +279,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> } #[must_use] - pub const fn for_host(&'a self) -> &'a T { + pub(crate) const fn for_host(&'a self) -> &'a T { self.host_ref } @@ -334,7 +336,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } #[must_use] - pub fn for_device(self) -> DeviceOwnedRef<'a, T> { + pub(crate) fn for_device(self) -> DeviceOwnedRef<'a, T> { DeviceOwnedRef { pointer: 
DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()), marker: PhantomData::, @@ -343,7 +345,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { } #[must_use] - pub fn for_host(&self) -> &T { + pub(crate) fn for_host(&self) -> &T { self.host_val } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 40985a0e8..7026efc1a 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -34,6 +34,9 @@ pub mod param; mod sealed { #[doc(hidden)] pub trait Sealed {} + + #[cfg(feature = "host")] + pub struct Token; } pub trait CudaKernelParameter: sealed::Sealed { @@ -58,17 +61,22 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O; #[doc(hidden)] #[cfg(feature = "host")] - fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> std::alloc::Layout; + fn shared_layout_for_async( + param: &Self::AsyncHostType<'_, '_>, + token: sealed::Token, + ) -> std::alloc::Layout; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E>; #[doc(hidden)] @@ -139,10 +147,10 @@ macro_rules! impl_launcher_launch { self.kernel.$launch_async::<$($T),*>(self.stream, &self.config, $($arg),*) } }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { + (impl $func:ident () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func($arg0 $(, $other)*, |$arg0| { impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) @@ -353,7 +361,7 @@ macro_rules! 
impl_typed_kernel_launch { Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), { let function = if config.ptx_jit { - impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + () { + impl_typed_kernel_launch! { impl with_async_as_ptx_jit ref ($($arg: $T),*) + (sealed::Token) { self.compile_with_ptx_jit_args(Some(&[$($arg),*])) } }? } else { @@ -363,7 +371,7 @@ macro_rules! impl_typed_kernel_launch { #[allow(unused_mut)] let mut shared_memory_size = crate::utils::shared::SharedMemorySize::new(); $( - shared_memory_size.add($T::shared_layout_for_async(&$arg)); + shared_memory_size.add($T::shared_layout_for_async(&$arg, sealed::Token)); )* let Ok(shared_memory_size) = u32::try_from(shared_memory_size.total()) else { // FIXME: this should really be InvalidConfiguration = 9 @@ -377,24 +385,24 @@ macro_rules! impl_typed_kernel_launch { shared_memory_size, &[ $(core::ptr::from_mut( - &mut $T::async_to_ffi($arg)? + &mut $T::async_to_ffi($arg, sealed::Token)? ).cast::()),* ], ) } } }; - (impl $func:ident () + ($($other:ident),*) $inner:block) => { + (impl $func:ident () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func($arg0 $(, $other)*, |$arg0| { impl_typed_kernel_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; - (impl $func:ident ref () + ($($other:ident),*) $inner:block) => { + (impl $func:ident ref () + ($($other:expr),*) $inner:block) => { $inner }; - (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:ident),*) $inner:block) => { + (impl $func:ident ref ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { $T0::$func(&$arg0 $(, $other)*, |$arg0| { impl_typed_kernel_launch! 
{ impl $func ref ($($arg: $T),*) + ($($other),*) $inner } }) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index ad4ade594..0e3bf8790 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -91,19 +91,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(param) } @@ -164,19 +169,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -235,6 +245,7 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -242,15 +253,19 @@ impl< } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -341,19 +356,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -459,19 +479,24 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? }; @@ -529,19 +554,24 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; @@ -592,6 +622,7 @@ impl< #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -601,12 +632,16 @@ impl< #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param) + as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } @@ -662,6 +697,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter #[cfg(feature = "host")] fn with_async_as_ptx_jit( param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { let param = unsafe { param.unwrap_ref_unchecked() }; @@ -669,15 +705,19 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, ) -> Result, E> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param) + <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -758,19 +798,24 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl 
for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(_param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { Layout::new::<()>() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], @@ -815,19 +860,24 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_async_as_ptx_jit( _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async(param: &Self::AsyncHostType<'_, '_>) -> Layout { + fn shared_layout_for_async( + param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { param.layout() } #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, ) -> Result, E> { Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), From 1e19fe13b6a0daf860f860aa78468f7941d5ca28 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Wed, 3 Jan 2024 13:51:02 +0000 Subject: [PATCH 082/120] Further small API improvements --- src/host/mod.rs | 52 +++++++++++++++++------------------ src/lend/impls/option.rs | 18 +----------- src/lend/mod.rs | 7 ++--- src/utils/exchange/wrapper.rs | 10 +++++-- 4 files changed, 37 insertions(+), 50 deletions(-) diff --git a/src/host/mod.rs b/src/host/mod.rs index 9efc7b9b1..2ddc768dd 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -109,19 +109,6 @@ pub struct HostAndDeviceMutRef<'a, T: PortableBitSemantics + TypeGraphLayout> { } impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { - /// # Safety - /// - /// 
`device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new( - device_box: &'a mut DeviceBox>, - host_ref: &'a mut T, - ) -> Self { - Self { - device_box, - host_ref, - } - } - /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -152,6 +139,19 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { result } + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub unsafe fn new_unchecked( + device_box: &'a mut DeviceBox>, + host_ref: &'a mut T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> @@ -225,19 +225,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConst impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {} impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> { - /// # Safety - /// - /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new( - device_box: &'a DeviceBox>, - host_ref: &'a T, - ) -> Self { - Self { - device_box, - host_ref, - } - } - /// # Errors /// /// Returns a [`CudaError`] iff `value` cannot be moved @@ -265,6 +252,19 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> result } + /// # Safety + /// + /// `device_box` must contain EXACTLY the device copy of `host_ref` + pub const unsafe fn new_unchecked( + device_box: &'a DeviceBox>, + host_ref: &'a T, + ) -> Self { + Self { + device_box, + host_ref, + } + } + #[must_use] pub(crate) fn for_device<'b>(&'b self) -> DeviceConstRef<'a, T> where diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index c05c0d3bb..197906baf 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -6,7 +6,7 @@ use const_type_layout::{TypeGraphLayout, TypeLayout}; use 
rustacuda::error::CudaResult; use crate::{ - lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaAsyncProxy, RustToCudaProxy}, + lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaProxy}, safety::PortableBitSemantics, utils::{adapter::RustToCudaWithPortableBitCopySemantics, ffi::DeviceAccessible}, }; @@ -214,19 +214,3 @@ impl RustToCudaProxy self.map(RustToCudaWithPortableBitCopySemantics::into_inner) } } - -impl RustToCudaAsyncProxy> - for Option> -{ - fn from_ref(val: &Option) -> &Self { - >>::from_ref(val) - } - - fn from_mut(val: &mut Option) -> &mut Self { - >>::from_mut(val) - } - - fn into(self) -> Option { - >>::into(self) - } -} diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 598a586b8..e2e5dcf99 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -155,12 +155,9 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } -pub trait RustToCudaAsyncProxy: RustToCudaAsync { - fn from_ref(val: &T) -> &Self; - fn from_mut(val: &mut T) -> &mut Self; +pub trait RustToCudaAsyncProxy: RustToCudaAsync + RustToCudaProxy {} - fn into(self) -> T; -} +impl> RustToCudaAsyncProxy for P {} #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index aeee541e1..660fdaae7 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -206,7 +206,10 @@ impl> ExchangeWrapperOnDevice { ) -> HostAndDeviceConstRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceConstRef::new(&self.device_box, (**self.locked_cuda_repr).into_ref()) + HostAndDeviceConstRef::new_unchecked( + &self.device_box, + (**self.locked_cuda_repr).into_ref(), + ) } } @@ -216,7 +219,10 @@ impl> ExchangeWrapperOnDevice { ) -> HostAndDeviceMutRef::CudaRepresentation>> { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { - HostAndDeviceMutRef::new(&mut self.device_box, 
(**self.locked_cuda_repr).into_mut()) + HostAndDeviceMutRef::new_unchecked( + &mut self.device_box, + (**self.locked_cuda_repr).into_mut(), + ) } } } From a0521861c267c747269213587cf50c6238a0632c Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 05:24:05 +0000 Subject: [PATCH 083/120] Add AsyncProj helper API struct for async projections --- src/kernel/param.rs | 36 ++++++++------------ src/utils/async.rs | 64 +++++++++++++++++++++++------------ src/utils/exchange/wrapper.rs | 56 +++++++++++++++--------------- 3 files changed, 84 insertions(+), 72 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 0e3bf8790..8ca41ddf4 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -143,11 +143,10 @@ impl< > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef<'b, T>, - crate::utils::r#async::NoCompletion, + &'b crate::host::HostAndDeviceConstRef<'b, T>, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -162,7 +161,7 @@ impl< inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner(const_ref.as_async(stream)) + inner(const_ref.as_async(stream).as_ref()) }) } @@ -188,8 +187,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -248,7 +246,7 @@ impl< _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -328,11 +326,10 @@ impl< > CudaKernelParameter for &'a ShallowInteriorMutable { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef<'b, T>, - crate::utils::r#async::NoCompletion, + &'b crate::host::HostAndDeviceConstRef<'b, T>, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -349,7 +346,7 @@ impl< inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async(stream)) + inner(const_ref.as_ref().as_async(stream).as_ref()) }) } @@ -375,8 +372,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -526,14 +522,13 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a SharedHeapPerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - crate::host::HostAndDeviceConstRef< + &'b crate::host::HostAndDeviceConstRef< 'b, DeviceAccessible<::CudaRepresentation>, >, - crate::utils::r#async::NoCompletion, >; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T; @@ -548,7 +543,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream))) + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream).as_ref())) } #[cfg(feature = "host")] @@ -573,8 +568,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? 
}; + let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -625,7 +619,7 @@ impl< _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -700,7 +694,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } diff --git a/src/utils/async.rs b/src/utils/async.rs index f408431ae..d945538d0 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -119,6 +119,10 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// the inner value can be safely returned and again be used in synchronous /// operations. /// + /// Calling `synchronize` after the computation has completed, e.g. after + /// calling [`rustacuda::stream::Stream::synchronize`], should be very + /// cheap. + /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. @@ -224,30 +228,12 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } } - /// # Safety - /// - /// The returned reference to the inner value of type `T` may not yet have - /// completed its asynchronous work and may thus be in an inconsistent - /// state. - /// - /// This method must only be used to construct a larger asynchronous - /// computation out of smaller ones that have all been submitted to the - /// same [`Stream`]. 
- pub const unsafe fn unwrap_ref_unchecked(&self) -> &T { - &self.value + pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { + AsyncProj::new(&self.value) } - /// # Safety - /// - /// The returned reference to the inner value of type `T` may not yet have - /// completed its asynchronous work and may thus be in an inconsistent - /// state. - /// - /// This method must only be used to construct a larger asynchronous - /// computation out of smaller ones that have all been submitted to the - /// same [`Stream`]. - pub unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { - &mut self.value + pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { + AsyncProj::new(&mut self.value) } } @@ -340,3 +326,37 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture } } } + +#[cfg(feature = "host")] +#[allow(clippy::module_name_repetitions)] +#[derive(Copy, Clone)] +pub struct AsyncProj<'a, 'stream, T: 'a> { + _capture: PhantomData<&'a ()>, + _stream: PhantomData<&'stream Stream>, + value: T, +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub(crate) const fn new(value: T) -> Self { + Self { + _capture: PhantomData::<&'a ()>, + _stream: PhantomData::<&'stream Stream>, + value, + } + } + + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_unchecked(self) -> T { + self.value + } +} diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 660fdaae7..0f1ff89f8 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -13,7 +13,7 @@ use crate::{ utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible, - r#async::{Async, CompletionFnMut, NoCompletion}, + r#async::{Async, AsyncProj, CompletionFnMut, NoCompletion}, }, }; @@ -355,41 +355,39 @@ impl< } } - // TODO: replace by async borrow map #[must_use] pub fn as_ref_async( &self, - ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> - { - let this = unsafe { self.unwrap_ref_unchecked() }; - - todo!() - - // Safety: `device_box` contains exactly the device copy of - // `locked_cuda_repr` unsafe { - // HostAndDeviceConstRefAsync::new( - // &*(this.device_box), - // (**(this.locked_cuda_repr)).into_ref(), - // ) - // } + ) -> AsyncProj< + '_, + 'stream, + HostAndDeviceConstRef::CudaRepresentation>>, + > { + let this = unsafe { self.as_ref().unwrap_unchecked() }; + + AsyncProj::new(unsafe { + HostAndDeviceConstRef::new_unchecked( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ) + }) } - // TODO: replace by async borrow map mut #[must_use] pub fn as_mut_async( &mut self, - ) -> Async<'_, 'stream, DeviceAccessible<::CudaRepresentation>, NoCompletion> - { - let this = unsafe { self.unwrap_mut_unchecked() }; - - todo!() - - // Safety: `device_box` contains exactly the device copy of - // `locked_cuda_repr` unsafe { - // HostAndDeviceMutRefAsync::new( - // &mut *(this.device_box), - // (**(this.locked_cuda_repr)).into_mut(), - // ) - // } + ) -> AsyncProj< + '_, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + > { + let this = unsafe { self.as_mut().unwrap_unchecked() }; + + AsyncProj::new(unsafe { + HostAndDeviceMutRef::new_unchecked( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ) + }) } 
} From b9d8ac0337bdad25e5c72991621af5fa865e197a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 05:25:39 +0000 Subject: [PATCH 084/120] Disable async derive in examples for now --- examples/derive/src/lib.rs | 4 ++-- examples/single-source/src/main.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 622b1b699..2d7b00ad6 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -3,14 +3,14 @@ #![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] struct Inner { #[cuda(embed)] inner: T, } #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 4783deffa..ec699d43c 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -24,7 +24,7 @@ fn main() {} pub struct Dummy(i32); #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] @@ -32,7 +32,7 @@ pub struct Wrapper { } #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc")] +#[cuda(crate = "rc", async = false)] pub struct Empty([u8; 0]); #[repr(C)] From e0729b11ee4cfd31a36ffd133c2e14c0d7debb22 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 4 Jan 2024 11:13:53 +0000 Subject: [PATCH 085/120] Implement RustToCudaAsync derive impls --- examples/derive/src/lib.rs | 4 +- examples/single-source/src/main.rs | 4 +- .../src/rust_to_cuda/field_copy.rs | 55 ++++++++++++-- rust-cuda-derive/src/rust_to_cuda/impl.rs | 74 +++++++++++++++---- rust-cuda-derive/src/rust_to_cuda/mod.rs | 6 ++ src/deps.rs | 3 + src/utils/async.rs | 20 ++++- 7 files changed, 140 insertions(+), 26 deletions(-) diff --git a/examples/derive/src/lib.rs 
b/examples/derive/src/lib.rs index 2d7b00ad6..622b1b699 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -3,14 +3,14 @@ #![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] struct Inner { #[cuda(embed)] inner: T, } #[derive(rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] struct Outer { #[cuda(embed)] inner: Inner, diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index ec699d43c..4783deffa 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -24,7 +24,7 @@ fn main() {} pub struct Dummy(i32); #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] #[allow(dead_code)] pub struct Wrapper { #[cuda(embed)] @@ -32,7 +32,7 @@ pub struct Wrapper { } #[derive(Clone, rc::lend::LendRustToCuda)] -#[cuda(crate = "rc", async = false)] +#[cuda(crate = "rc")] pub struct Empty([u8; 0]); #[repr(C)] diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 1baf8829e..c32ac67ee 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -16,9 +16,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_declarations: &mut Vec, r2c_field_async_declarations: &mut Vec, + r2c_field_async_completions: &mut Vec, r2c_field_initialisations: &mut Vec, r2c_field_destructors: &mut Vec, r2c_field_async_destructors: &mut Vec, + r2c_field_async_completion_calls: &mut Vec, c2r_field_initialisations: &mut Vec, ) -> (TokenStream, TokenStream) { @@ -32,6 +34,11 @@ pub fn impl_field_copy_init_and_expand_alloc_type( Some(ident) => format_ident!("field_{}_repr", ident), None => format_ident!("field_{}_repr", field_index), }; + #[allow(clippy::option_if_let_else)] + let field_completion_ident = match &field.ident { + Some(ident) => 
format_ident!("field_{}_completion", ident), + None => format_ident!("field_{}_completion", field_index), + }; let optional_field_ident = field.ident.as_ref().map(|ident| quote! { #ident: }); match cuda_repr_field_ty { @@ -83,6 +90,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -96,13 +104,29 @@ pub fn impl_field_copy_init_and_expand_alloc_type( )?; }); r2c_field_async_destructors.push(quote! { - let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( - &mut self.#field_accessor, + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| &mut this.#field_accessor), alloc_front, stream, )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, &mut this.#field_accessor, + )?; }); + r2c_field_async_completions.push(field_completion_ident); + c2r_field_initialisations.push(quote! { #optional_field_ident { #crate_path::lend::CudaAsRust::as_rust(&this.#field_accessor) @@ -139,6 +163,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( alloc_front, stream, )?; + let (#field_repr_ident, #field_completion_ident) = #field_repr_ident.unwrap_unchecked()?; }); r2c_field_initialisations.push(quote! { @@ -154,15 +179,33 @@ pub fn impl_field_copy_init_and_expand_alloc_type( )?; }); r2c_field_async_destructors.push(quote! 
{ - let alloc_front = #crate_path::lend::RustToCudaAsync::restore_async( - < - #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> - >::from_mut(&mut self.#field_accessor), + let this_backup = unsafe { + ::core::mem::ManuallyDrop::new(::core::ptr::read(&this)) + }; + let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( + this.map_mut(|this| < + #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + >::from_mut(&mut this.#field_accessor)), alloc_front, stream, )?; + let (value, #field_completion_ident) = r#async.unwrap_unchecked()?; + ::core::mem::forget(value); + let this = ::core::mem::ManuallyDrop::into_inner(this_backup); + }); + + r2c_field_async_completion_calls.push(quote! { + #crate_path::utils::r#async::Completion::< + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> + >::complete( + #field_completion_ident, < + #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + >::from_mut(&mut this.#field_accessor), + )?; }); + r2c_field_async_completions.push(field_completion_ident); + c2r_field_initialisations.push(quote! { #optional_field_ident { #crate_path::lend::RustToCudaProxy::<#field_ty>::into( diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index d1249720e..674f5e166 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -127,8 +127,10 @@ pub fn rust_to_cuda_async_trait( struct_fields_cuda: &syn::Fields, combined_cuda_alloc_async_type: &TokenStream, r2c_field_async_declarations: &[TokenStream], + r2c_field_async_completions: &[syn::Ident], r2c_field_initialisations: &[TokenStream], r2c_field_async_destructors: &[TokenStream], + r2c_field_async_completion_calls: &[TokenStream], ) -> TokenStream { let rust_to_cuda_struct_construction = match struct_fields_cuda { syn::Fields::Named(_) => quote! { @@ -144,6 +146,39 @@ pub fn rust_to_cuda_async_trait( syn::Fields::Unit => quote! 
{ #struct_name_cuda }, }; + let async_borrow_completion = if r2c_field_async_completions.is_empty() { + quote! { #crate_path::utils::r#async::Async::ready(borrow, stream) } + } else { + quote! { + if #(#r2c_field_async_completions.is_none())&&* { + #crate_path::utils::r#async::Async::ready(borrow, stream) + } else { + #crate_path::utils::r#async::Async::pending( + borrow, stream, #crate_path::utils::r#async::NoCompletion, + )? + } + } + }; + + let async_restore_completion = if r2c_field_async_completions.is_empty() { + quote! { #crate_path::utils::r#async::Async::ready(this, stream) } + } else { + quote! { + if #(#r2c_field_async_completions.is_none())&&* { + #crate_path::utils::r#async::Async::ready(this, stream) + } else { + #crate_path::utils::r#async::Async::< + _, #crate_path::utils::r#async::CompletionFnMut, + >::pending( + this, stream, Box::new(|this| { + #(#r2c_field_async_completion_calls)* + Ok(()) + }), + )? + } + } + }; + let (impl_generics, ty_generics, where_clause) = struct_generics_cuda_async.split_for_impl(); quote! 
{ @@ -153,13 +188,16 @@ pub fn rust_to_cuda_async_trait( type CudaAllocationAsync = #combined_cuda_alloc_async_type; #[cfg(not(target_os = "cuda"))] - unsafe fn borrow_async( + unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &#crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::deps::rustacuda::stream::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( - #crate_path::utils::ffi::DeviceAccessible, - #crate_path::alloc::CombinedCudaAlloc + #crate_path::utils::r#async::Async< + '_, 'stream, + #crate_path::utils::ffi::DeviceAccessible, + >, + #crate_path::alloc::CombinedCudaAlloc, )> { let alloc_front = #crate_path::alloc::NoCudaAlloc; let alloc_tail = alloc; @@ -167,26 +205,36 @@ pub fn rust_to_cuda_async_trait( #(#r2c_field_async_declarations)* let borrow = #rust_to_cuda_struct_construction; + let borrow = #crate_path::utils::ffi::DeviceAccessible::from(borrow); - Ok(( - #crate_path::utils::ffi::DeviceAccessible::from(borrow), - #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail) - )) + let r#async = #async_borrow_completion; + let alloc = #crate_path::alloc::CombinedCudaAlloc::new(alloc_front, alloc_tail); + + Ok((r#async, alloc)) } #[cfg(not(target_os = "cuda"))] - unsafe fn restore_async( - &mut self, + unsafe fn restore_async<'a, 'stream, CudaAllocType: #crate_path::alloc::CudaAlloc, CudaRestoreOwner>( + this: #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>, alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &#crate_path::deps::rustacuda::stream::Stream, - ) -> #crate_path::deps::rustacuda::error::CudaResult { + stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + ) -> #crate_path::deps::rustacuda::error::CudaResult<( + #crate_path::utils::r#async::Async< + 'a, 'stream, + #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>, + 
#crate_path::utils::r#async::CompletionFnMut<'a, Self>, + >, + CudaAllocType, + )> { let (alloc_front, alloc_tail) = alloc.split(); #(#r2c_field_async_destructors)* - Ok(alloc_tail) + let r#async = #async_restore_completion; + + Ok((r#async, alloc_tail)) } } } diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs index 77382d4c4..615c81edf 100644 --- a/rust-cuda-derive/src/rust_to_cuda/mod.rs +++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs @@ -38,9 +38,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { }; let mut r2c_field_declarations: Vec = Vec::new(); let mut r2c_field_async_declarations: Vec = Vec::new(); + let mut r2c_field_async_completions: Vec = Vec::new(); let mut r2c_field_initialisations: Vec = Vec::new(); let mut r2c_field_destructors: Vec = Vec::new(); let mut r2c_field_async_destructors: Vec = Vec::new(); + let mut r2c_field_async_completion_calls: Vec = Vec::new(); let mut c2r_field_initialisations: Vec = Vec::new(); @@ -70,9 +72,11 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { combined_cuda_alloc_async_type, &mut r2c_field_declarations, &mut r2c_field_async_declarations, + &mut r2c_field_async_completions, &mut r2c_field_initialisations, &mut r2c_field_destructors_reverse, &mut r2c_field_async_destructors_reverse, + &mut r2c_field_async_completion_calls, &mut c2r_field_initialisations, ); } @@ -117,8 +121,10 @@ pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream { &struct_fields_cuda, &combined_cuda_alloc_async_type, &r2c_field_async_declarations, + &r2c_field_async_completions, &r2c_field_initialisations, &r2c_field_async_destructors, + &r2c_field_async_completion_calls, ) } else { TokenStream::new() diff --git a/src/deps.rs b/src/deps.rs index 68257e095..0000f9250 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -2,6 +2,9 @@ pub(crate) extern crate alloc; pub extern crate const_type_layout; +#[cfg(feature = "host")] 
+pub extern crate owning_ref; + #[cfg(feature = "host")] pub extern crate rustacuda; diff --git a/src/utils/async.rs b/src/utils/async.rs index d945538d0..84d8cedd8 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -16,7 +16,7 @@ pub struct NoCompletion; pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a>; #[cfg(feature = "host")] -pub trait Completion>: sealed::Sealed { +pub trait Completion>: sealed::Sealed { type Completed: ?Sized; #[allow(clippy::missing_errors_doc)] // FIXME @@ -28,9 +28,10 @@ mod sealed { } #[cfg(feature = "host")] -impl Completion for NoCompletion { +impl Completion for NoCompletion { type Completed = T; + #[inline] fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { Ok(()) } @@ -39,9 +40,10 @@ impl Completion for NoCompletion { impl sealed::Sealed for NoCompletion {} #[cfg(feature = "host")] -impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { +impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { (self)(completed) } @@ -49,6 +51,18 @@ impl<'a, T: BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { #[cfg(feature = "host")] impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} +#[cfg(feature = "host")] +impl, C: Completion> Completion for Option { + type Completed = C::Completed; + + #[inline] + fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { + self.map_or(Ok(()), |completion| completion.complete(completed)) + } +} +#[cfg(feature = "host")] +impl sealed::Sealed for Option {} + #[cfg(feature = "host")] pub struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { _stream: PhantomData<&'stream Stream>, From 875f04981021cd34fbd379fe057c78b37d0580c9 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 5 Jan 2024 03:46:46 +0000 Subject: [PATCH 086/120] Further async API improvements to add drop 
behaviour --- src/lend/mod.rs | 17 ++++++- src/utils/async.rs | 122 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 22 deletions(-) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index e2e5dcf99..7d8a1e864 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -256,7 +256,13 @@ pub trait LendToCudaAsync: RustToCudaAsync { /// Lends an immutable copy of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure - /// - after the closure, `&self` will not have changed + /// - after the closure, `&self` will not have changed, i.e. interior + /// mutability is not handled by this method + /// + /// Since the [`HostAndDeviceConstRef`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. /// /// # Errors /// @@ -270,6 +276,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -282,6 +289,11 @@ pub trait LendToCudaAsync: RustToCudaAsync { /// Moves `self` to CUDA iff `self` is [`StackOnly`]. /// + /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with + /// [`NoCompletion`], this [`Async`] can be safely dropped or forgotten + /// without changing any behaviour. Therefore, this [`Async`] does *not* + /// need to be returned from the `inner` closure. 
+ /// /// # Errors /// /// Returns a [`CudaError`] iff an error occurs inside CUDA @@ -294,6 +306,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -316,6 +329,7 @@ impl LendToCudaAsync for T { '_, 'stream, HostAndDeviceConstRef::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( @@ -355,6 +369,7 @@ impl LendToCudaAsync for T { 'a, 'stream, HostAndDeviceOwned::CudaRepresentation>>, + NoCompletion, >, ) -> Result, >( diff --git a/src/utils/async.rs b/src/utils/async.rs index 84d8cedd8..87b91a3e0 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -19,6 +19,9 @@ pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a> pub trait Completion>: sealed::Sealed { type Completed: ?Sized; + #[doc(hidden)] + fn synchronize_on_drop(&self) -> bool; + #[allow(clippy::missing_errors_doc)] // FIXME fn complete(self, completed: &mut Self::Completed) -> CudaResult<()>; } @@ -31,6 +34,11 @@ mod sealed { impl Completion for NoCompletion { type Completed = T; + #[inline] + fn synchronize_on_drop(&self) -> bool { + false + } + #[inline] fn complete(self, _completed: &mut Self::Completed) -> CudaResult<()> { Ok(()) @@ -43,6 +51,11 @@ impl sealed::Sealed for NoCompletion {} impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] + fn synchronize_on_drop(&self) -> bool { + true + } + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { (self)(completed) @@ -55,6 +68,11 @@ impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} impl, C: Completion> Completion for Option { type Completed = C::Completed; + #[inline] + fn synchronize_on_drop(&self) -> bool { + self.as_ref().map_or(false, Completion::synchronize_on_drop) + } + #[inline] fn complete(self, completed: &mut Self::Completed) -> CudaResult<()> { self.map_or(Ok(()), |completion| completion.complete(completed)) @@ -107,9 +125,7 @@ 
impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { - let event = CudaDropWrapper::from(Event::new( - EventFlags::DISABLE_TIMING | EventFlags::BLOCKING_SYNC, - )?); + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); let (sender, receiver) = oneshot::channel(); @@ -140,9 +156,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn synchronize(mut self) -> CudaResult { - let (receiver, completion) = match self.status { - AsyncStatus::Completed { result } => return result.map(|()| self.value), + pub fn synchronize(self) -> CudaResult { + let (mut value, status) = self.destructure_into_parts(); + + let (receiver, completion) = match status { + AsyncStatus::Completed { result } => return result.map(|()| value), AsyncStatus::Processing { receiver, completion, @@ -157,9 +175,9 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea Err(oneshot::RecvError) => return Err(CudaError::AlreadyAcquired), } - completion.complete(self.value.borrow_mut())?; + completion.complete(value.borrow_mut())?; - Ok(self.value) + Ok(value) } /// Moves the asynchronous data move to a different [`Stream`]. @@ -168,15 +186,17 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn move_to_stream<'stream_new>( - mut self, + self, stream: &'stream_new Stream, ) -> CudaResult> { - let (receiver, completion, event) = match self.status { + let (mut value, status) = self.destructure_into_parts(); + + let (receiver, completion, event) = match status { AsyncStatus::Completed { .. 
} => { return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, - status: self.status, + value, + status, _capture: PhantomData::<&'a ()>, }) }, @@ -196,7 +216,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea return Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, + value, status: AsyncStatus::Processing { receiver, completion, @@ -209,11 +229,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - completion.complete(self.value.borrow_mut())?; + completion.complete(value.borrow_mut())?; Ok(Async { _stream: PhantomData::<&'stream_new Stream>, - value: self.value, + value, status: AsyncStatus::Completed { result: Ok(()) }, _capture: PhantomData::<&'a ()>, }) @@ -230,15 +250,17 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// computation out of smaller ones that have all been submitted to the /// same [`Stream`]. 
pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { - match self.status { - AsyncStatus::Completed { result: Ok(()) } => Ok((self.value, None)), + let (value, status) = self.destructure_into_parts(); + + match status { + AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), AsyncStatus::Completed { result: Err(err) } => Err(err), AsyncStatus::Processing { receiver: _, completion, event: _, _capture, - } => Ok((self.value, Some(completion))), + } => Ok((value, Some(completion))), } } @@ -249,6 +271,34 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { AsyncProj::new(&mut self.value) } + + #[must_use] + fn destructure_into_parts(self) -> (T, AsyncStatus<'a, T, C>) { + let this = std::mem::ManuallyDrop::new(self); + + // Safety: we destructure self into its droppable components, + // value and status, without dropping self itself + unsafe { (std::ptr::read(&this.value), (std::ptr::read(&this.status))) } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { + fn drop(&mut self) { + let AsyncStatus::Processing { + receiver, + completion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(self.value.borrow_mut()); + } + } } #[cfg(feature = "host")] @@ -311,8 +361,9 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match self.status - { + let (value, status) = self.destructure_into_parts(); + + let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { AsyncStatus::Completed { result } => { (None, AsyncStatus::Completed:: { result }) }, @@ 
-334,13 +385,42 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture AsyncFuture { _stream: PhantomData::<&'stream Stream>, - value: Some(self.value), + value: Some(value), completion, status, } } } +#[cfg(feature = "host")] +impl<'a, 'stream, T: BorrowMut, C: Completion> Drop + for AsyncFuture<'a, 'stream, T, C> +{ + fn drop(&mut self) { + let Some(mut value) = self.value.take() else { + return; + }; + + let AsyncStatus::Processing { + receiver, + completion: NoCompletion, + event: _, + _capture, + } = std::mem::replace(&mut self.status, AsyncStatus::Completed { result: Ok(()) }) + else { + return; + }; + + let Some(completion) = self.completion.take() else { + return; + }; + + if completion.synchronize_on_drop() && receiver.recv() == Ok(Ok(())) { + let _ = completion.complete(value.borrow_mut()); + } + } +} + #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] #[derive(Copy, Clone)] From 356b7b2f5085e55be706304e4b80f3fb4531cf89 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 5 Jan 2024 04:30:31 +0000 Subject: [PATCH 087/120] First sketch of the safety constraints of a new NoSafeAliasing trait --- src/safety/aliasing.rs | 83 ++++++++++++++++++++++++++++++++++++++++++ src/safety/mod.rs | 2 + 2 files changed, 85 insertions(+) create mode 100644 src/safety/aliasing.rs diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs new file mode 100644 index 000000000..25cb61992 --- /dev/null +++ b/src/safety/aliasing.rs @@ -0,0 +1,83 @@ +#[allow(clippy::module_name_repetitions)] +/// Types for which mutable references can be safely shared with each CUDA +/// thread without breaking Rust's no-mutable-aliasing memory safety +/// guarantees. +/// +/// # Safety +/// +/// A type may only implement [`NoSafeAliasing`], if and only if all of the +/// conditions below hold: +/// +/// * Calling [`std::mem::replace`] on a mutable reference of the type does +/// *not* return a value which owns memory which it must deallocate on drop. 
+/// For instance, `&mut [T]` satisfies this criteria, but `Box` does not. +/// +/// * No safe alising mutable access is provided to the same memory locations +/// across multiple CUDA threads. You can use the +/// [`SplitSliceOverCudaThreadsConstStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride) +/// and +/// [`SplitSliceOverCudaThreadsDynamicStride`](crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride) +/// wrapper types to ensure that each thread is only given access to to its +/// own sub-slice partition so that aliasing is avoided. +/// +/// * A mutable reference of the type must not provide mutable access to some +/// shallow inner state (in contrast to deep, which refers to values behind +/// references) of the value which the API user expects to be mutably shared +/// between all threads even if it is not in practice so as to not violate the +/// second condition. For instance, a struct `Counter { pub a: u32 }` violates +/// this third condition, as code with access to `&mut Counter` also gets +/// mutable access to its field `a` and might assume that mutations of this +/// field are either shared across threads or shared back with the host after +/// the kernel has completed, neither of which is possible. In contrast, `&mut +/// [T]` satisfies this condition, as it is well known that modifying the +/// shallow length of a slice (by assigning a sub-slice) inside a function +/// does not alter the length of the slice that the caller of the function +/// passed in. 
+pub unsafe trait NoSafeAliasing {} + +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const STRIDE: usize, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> +{ +} +unsafe impl< + 'a, + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> +{ +} + +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + const STRIDE: usize, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + STRIDE, + > +{ +} +unsafe impl< + T: crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + const_type_layout::TypeGraphLayout, + const M2D: bool, + const M2H: bool, + > NoSafeAliasing + for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< + crate::utils::exchange::buffer::CudaExchangeBuffer, + > +{ +} diff --git a/src/safety/mod.rs b/src/safety/mod.rs index 243a2a9f9..a3741ea90 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -1,3 +1,4 @@ +mod aliasing; mod arch; mod portable; mod stack_only; @@ -7,5 +8,6 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; +pub use aliasing::NoSafeAliasing; pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; From 564ab2beeaece8e386506a483f4e948d4d16e584 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 6 Jan 2024 20:33:25 +0000 Subject: [PATCH 088/120] First steps towards reintroducing LendToCudaMut --- examples/single-source/src/main.rs | 4 +- src/kernel/param.rs | 196 +++++++++++++++++++++++------ src/lend/mod.rs | 177 ++++++++++++++++++++++++-- src/safety/aliasing.rs | 38 +++--- src/safety/mod.rs | 
2 +- src/utils/aliasing/const.rs | 2 +- src/utils/aliasing/dynamic.rs | 2 +- src/utils/async.rs | 43 ++++++- src/utils/exchange/wrapper.rs | 41 ++---- 9 files changed, 409 insertions(+), 96 deletions(-) diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 4783deffa..89bbdf990 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -64,9 +64,9 @@ pub fn kernel< + rc::safety::StackOnly, >( _x: &rc::kernel::param::PerThreadShallowCopy, - _z: &rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _z: &rc::kernel::param::DeepPerThreadBorrow>, _v @ _w: &'a rc::kernel::param::ShallowInteriorMutable, - _: rc::kernel::param::SharedHeapPerThreadShallowCopy>, + _: rc::kernel::param::DeepPerThreadBorrow>, q @ Triple(s, mut __t, _u): rc::kernel::param::PerThreadShallowCopy, shared3: &mut rc::utils::shared::ThreadBlockShared, dynamic: &mut rc::utils::shared::ThreadBlockSharedSlice, diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 8ca41ddf4..f28944b81 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -14,8 +14,8 @@ use crate::{ alloc::EmptyCudaAlloc, kernel::{sealed, CudaKernelParameter}, lend::RustToCuda, - safety::PortableBitSemantics, - utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceOwnedRef}, + safety::{PortableBitSemantics, SafeMutableAliasing}, + utils::ffi::{DeviceAccessible, DeviceConstRef, DeviceMutRef, DeviceOwnedRef}, }; pub struct PtxJit { @@ -424,12 +424,12 @@ impl s { } -pub struct SharedHeapPerThreadShallowCopy { +pub struct DeepPerThreadBorrow { never: !, _marker: PhantomData, } -impl Deref for SharedHeapPerThreadShallowCopy { +impl Deref for DeepPerThreadBorrow { type Target = T; fn deref(&self) -> &Self::Target { @@ -444,7 +444,7 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > CudaKernelParameter for SharedHeapPerThreadShallowCopy + > CudaKernelParameter for DeepPerThreadBorrow { #[cfg(feature = "host")] 
type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< @@ -514,13 +514,11 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > sealed::Sealed for SharedHeapPerThreadShallowCopy + > sealed::Sealed for DeepPerThreadBorrow { } -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a SharedHeapPerThreadShallowCopy -{ +impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, @@ -580,7 +578,78 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } } } -impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a SharedHeapPerThreadShallowCopy {} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b mut crate::host::HostAndDeviceMutRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + >; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut T; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + _param: 
&Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut DeepPerThreadBorrow +{ +} impl< T: Send @@ -589,18 +658,17 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > CudaKernelParameter for PtxJit> + > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + as CudaKernelParameter>::AsyncHostType<'stream, 'b>; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; + as CudaKernelParameter>::FfiType<'stream, 'b>; #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; + type SyncHostType = as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>( @@ -608,9 +676,7 @@ impl< stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - as 
CudaKernelParameter>::with_new_async( - param, stream, inner, - ) + as CudaKernelParameter>::with_new_async(param, stream, inner) } #[cfg(feature = "host")] @@ -628,7 +694,7 @@ impl< param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param, token) + as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "host")] @@ -646,7 +712,7 @@ impl< ) -> O { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - as CudaKernelParameter>::with_ffi_as_device::( + as CudaKernelParameter>::with_ffi_as_device::( param, inner, ) } @@ -658,24 +724,84 @@ impl< CudaRepresentation: 'static + crate::safety::StackOnly, CudaAllocation: EmptyCudaAlloc, >, - > sealed::Sealed for PtxJit> + > sealed::Sealed for PtxJit> { } impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; + type FfiType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; + #[cfg(feature = "host")] + type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, + ) -> Result { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, inner) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit( + param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O { + let param = unsafe { param.unwrap_unchecked() }; + 
inner(Some(&param_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async( + _param: &Self::AsyncHostType<'_, '_>, + _token: sealed::Token, + ) -> Layout { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device( + param: Self::FfiType<'static, 'static>, + inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, + ) -> O { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( + param, inner, + ) + } +} +impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; type FfiType<'stream, 'b> = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; #[cfg(feature = "host")] - type SyncHostType = - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::SyncHostType; + type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] fn with_new_async<'stream, O, E: From>(
stream: &'stream rustacuda::stream::Stream, inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_new_async( + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( param, stream, inner, ) } @@ -694,7 +820,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, ) -> O { - let param = unsafe { param.unwrap_unchecked() }; + let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(&param_as_raw_bytes(param.for_host()))) } @@ -711,7 +837,7 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, ) -> Result, E> { - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] @@ -721,13 +847,13 @@ impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter ) -> O { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - <&'a SharedHeapPerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( param, inner, ) } } -impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed - for &'a PtxJit> +impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut PtxJit> { } diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 7d8a1e864..a78cd4018 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -7,16 +7,16 @@ use rustacuda::error::CudaError; pub use rust_cuda_derive::LendRustToCuda; #[cfg(any(feature = "host", feature = "device", doc))] -use crate::safety::StackOnly; +use crate::safety::{SafeMutableAliasing, StackOnly}; #[cfg(feature = "device")] -use crate::utils::ffi::{DeviceConstRef, DeviceOwnedRef}; +use crate::utils::ffi::{DeviceConstRef, 
DeviceMutRef, DeviceOwnedRef}; use crate::{alloc::CudaAlloc, safety::PortableBitSemantics}; #[cfg(any(feature = "host", feature = "device"))] use crate::{alloc::EmptyCudaAlloc, utils::ffi::DeviceAccessible}; #[cfg(feature = "host")] use crate::{ alloc::{CombinedCudaAlloc, NoCudaAlloc}, - host::{HostAndDeviceConstRef, HostAndDeviceOwned}, + host::{HostAndDeviceConstRef, HostAndDeviceMutRef, HostAndDeviceOwned}, utils::r#async::{Async, CompletionFnMut, NoCompletion}, }; @@ -162,7 +162,7 @@ impl> RustToCudaAsyncProxy for P { #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] pub trait LendToCuda: RustToCuda { - /// Lends an immutable copy of `&self` to CUDA: + /// Lends an immutable borrow of `&self` to CUDA: /// - code in the CUDA kernel can only access `&self` through the /// [`DeviceConstRef`] inside the closure /// - after the closure, `&self` will not have changed @@ -183,7 +183,30 @@ pub trait LendToCuda: RustToCuda { where Self: Sync; - /// Moves `self` to CUDA iff `self` is [`StackOnly`]. + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing; + + /// Moves `self` to CUDA iff `Self` is [`StackOnly`]. 
/// /// # Errors /// @@ -227,6 +250,30 @@ impl LendToCuda for T { result } + fn lend_to_cuda_mut< + O, + E: From, + F: FnOnce( + HostAndDeviceMutRef::CudaRepresentation>>, + ) -> Result, + >( + &mut self, + inner: F, + ) -> Result + where + Self: Sync + SafeMutableAliasing, + { + let (mut cuda_repr, alloc) = unsafe { self.borrow(NoCudaAlloc) }?; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, inner); + + core::mem::drop(cuda_repr); + + let _: NoCudaAlloc = unsafe { self.restore(alloc) }?; + + result + } + fn move_to_cuda< O, E: From, @@ -287,6 +334,45 @@ pub trait LendToCudaAsync: RustToCudaAsync { where Self: Sync; + #[allow(clippy::type_complexity)] + /// Lends a mutable borrow of `&mut self` to CUDA iff `Self` is + /// [`SafeMutableAliasing`]: + /// - code in the CUDA kernel can only access `&mut self` through the + /// `DeviceMutRef` inside the closure + /// - after the closure, `&mut self` will reflect the changes from the + /// kernel execution + /// + /// # Errors + /// + /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + T: 'a, + >( + this: owning_ref::BoxRefMut<'a, T, Self>, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, T, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing; + /// Moves `self` to CUDA iff `self` is [`StackOnly`]. 
/// /// Since the [`HostAndDeviceOwned`] is wrapped in an [`Async`] with @@ -360,6 +446,55 @@ impl LendToCudaAsync for T { result } + fn lend_to_cuda_mut_async< + 'a, + 'stream, + O, + E: From, + F: for<'b> FnOnce( + Async< + 'b, + 'stream, + HostAndDeviceMutRef::CudaRepresentation>>, + NoCompletion, + >, + ) -> Result, + S: 'a, + >( + this: owning_ref::BoxRefMut<'a, S, Self>, + stream: &'stream rustacuda::stream::Stream, + inner: F, + ) -> Result< + ( + Async<'a, 'stream, owning_ref::BoxRefMut<'a, S, Self>, CompletionFnMut<'a, Self>>, + O, + ), + E, + > + where + Self: Sync + SafeMutableAliasing, + { + let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; + + let (mut cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { + let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + Async::pending(mut_ref, stream, NoCompletion)? + } else { + Async::ready(mut_ref, stream) + }; + + inner(r#async) + }); + + core::mem::drop(cuda_repr); + + let (r#async, _): (_, NoCudaAlloc) = unsafe { Self::restore_async(this, alloc, stream) }?; + + result.map(|ok| (r#async, ok)) + } + fn move_to_cuda_async< 'stream, O, @@ -403,13 +538,25 @@ pub trait BorrowFromRust: RustToCuda { /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the - /// [`DeviceConstRef`] borrowed on the CPU using the corresponding - /// [`LendToCuda::lend_to_cuda`]. + /// [`DeviceConstRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda`]. unsafe fn with_borrow_from_rust O>( cuda_repr: DeviceConstRef::CudaRepresentation>>, inner: F, ) -> O; + /// # Safety + /// + /// This function is only safe to call iff `cuda_repr_mut` is the + /// [`DeviceMutRef`] borrowed on the CPU using the corresponding + /// [`LendToCuda::lend_to_cuda_mut`]. 
+ unsafe fn with_borrow_from_rust_mut O>( + cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: SafeMutableAliasing; + /// # Safety /// /// This function is only safe to call iff `cuda_repr` is the @@ -437,6 +584,22 @@ impl BorrowFromRust for T { inner(&rust_repr) } + #[inline] + unsafe fn with_borrow_from_rust_mut O>( + mut cuda_repr_mut: DeviceMutRef::CudaRepresentation>>, + inner: F, + ) -> O + where + Self: SafeMutableAliasing, + { + // `rust_repr` must never be dropped as we do NOT own any of the + // heap memory it might reference + let mut rust_repr_mut = + core::mem::ManuallyDrop::new(CudaAsRust::as_rust(cuda_repr_mut.as_mut())); + + inner(&mut rust_repr_mut) + } + #[inline] unsafe fn with_moved_from_rust O>( mut cuda_repr: DeviceOwnedRef::CudaRepresentation>>, diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs index 25cb61992..7add5775c 100644 --- a/src/safety/aliasing.rs +++ b/src/safety/aliasing.rs @@ -5,8 +5,8 @@ /// /// # Safety /// -/// A type may only implement [`NoSafeAliasing`], if and only if all of the -/// conditions below hold: +/// A type may only implement [`SafeMutableAliasing`], if and +/// only if all of the safety conditions below hold: /// /// * Calling [`std::mem::replace`] on a mutable reference of the type does /// *not* return a value which owns memory which it must deallocate on drop. @@ -24,16 +24,18 @@ /// shallow inner state (in contrast to deep, which refers to values behind /// references) of the value which the API user expects to be mutably shared /// between all threads even if it is not in practice so as to not violate the -/// second condition. 
For instance, a struct `Counter { pub a: u32 }` violates -/// this third condition, as code with access to `&mut Counter` also gets -/// mutable access to its field `a` and might assume that mutations of this -/// field are either shared across threads or shared back with the host after -/// the kernel has completed, neither of which is possible. In contrast, `&mut -/// [T]` satisfies this condition, as it is well known that modifying the -/// shallow length of a slice (by assigning a sub-slice) inside a function -/// does not alter the length of the slice that the caller of the function -/// passed in. -pub unsafe trait NoSafeAliasing {} +/// second condition. For instance, `Vec` violates this third condition, as +/// code with access to `&mut Vec` can also mutate the length of the +/// vector, which is shallow state that is expected to be propagated to the +/// caller of a function sharing this vector (it is also related to the deep +/// contents of the vector via a safety invariant) and might thus assume that +/// mutations of this length are either shared across threads or shared back +/// with the host after the kernel has completed, neither of which is +/// possible. In contrast, `&mut [T]` satisfies this condition, as it is well +/// known that modifying the shallow length of a slice (by assigning a +/// sub-slice) inside a function does not alter the length of the slice that +/// the caller of the function passed in. 
+pub unsafe trait SafeMutableAliasing {} unsafe impl< 'a, @@ -41,20 +43,22 @@ unsafe impl< + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, const STRIDE: usize, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE> { } + unsafe impl< 'a, T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]> { } +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: crate::safety::StackOnly + crate::safety::PortableBitSemantics @@ -62,20 +66,22 @@ unsafe impl< const M2D: bool, const M2H: bool, const STRIDE: usize, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride< crate::utils::exchange::buffer::CudaExchangeBuffer, STRIDE, > { } + +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: crate::safety::StackOnly + crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, const M2D: bool, const M2H: bool, - > NoSafeAliasing + > SafeMutableAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride< crate::utils::exchange::buffer::CudaExchangeBuffer, > diff --git a/src/safety/mod.rs b/src/safety/mod.rs index a3741ea90..c26ef3389 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -8,6 +8,6 @@ pub mod kernel_signature; #[doc(hidden)] pub mod type_layout; -pub use aliasing::NoSafeAliasing; +pub use aliasing::SafeMutableAliasing; pub use portable::PortableBitSemantics; pub use stack_only::StackOnly; diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 0259c301a..3ca7b0597 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -13,7 +13,7 @@ use crate::{ }; #[repr(transparent)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, 
TypeLayout)] pub struct SplitSliceOverCudaThreadsConstStride(T); impl SplitSliceOverCudaThreadsConstStride { diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 1c502dc8e..2c663e9d6 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -13,7 +13,7 @@ use crate::{ }; #[repr(C)] -#[derive(Clone, TypeLayout)] +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, TypeLayout)] pub struct SplitSliceOverCudaThreadsDynamicStride { stride: usize, inner: T, diff --git a/src/utils/async.rs b/src/utils/async.rs index 87b91a3e0..e98758d4f 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -240,7 +240,6 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } #[allow(clippy::missing_errors_doc)] // FIXME - #[allow(clippy::type_complexity)] // FIXME /// # Safety /// /// The returned inner value of type `T` may not yet have completed its @@ -454,3 +453,45 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { self.value } } + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { + #[must_use] + pub const fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } +} + +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { + #[must_use] + pub fn as_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } + + #[must_use] + pub fn as_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: self.value, + } + } +} diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 0f1ff89f8..1f3326c5b 100644 --- a/src/utils/exchange/wrapper.rs 
+++ b/src/utils/exchange/wrapper.rs @@ -10,6 +10,7 @@ use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef}, lend::{RustToCuda, RustToCudaAsync}, + safety::SafeMutableAliasing, utils::{ adapter::DeviceCopyWithPortableBitSemantics, ffi::DeviceAccessible, @@ -85,16 +86,10 @@ impl> ExchangeWrapperOnHost { }) } - // TODO: safety constraint? /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably /// via [`ExchangeWrapperOnDevice::as_mut`]. /// - /// To avoid aliasing, each CUDA thread will get access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -118,14 +113,8 @@ impl { #[allow(clippy::needless_lifetimes)] // keep 'stream explicit - // TODO: safety constraint? /// Moves the data asynchronously to the CUDA device. /// - /// To avoid aliasing, each CUDA thread will get access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -174,14 +163,8 @@ impl> DerefMut for ExchangeWrapper } impl> ExchangeWrapperOnDevice { - // TODO: safety constraint? /// Moves the data synchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. 
Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -216,7 +199,10 @@ impl> ExchangeWrapperOnDevice { #[must_use] pub fn as_mut( &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> { + ) -> HostAndDeviceMutRef::CudaRepresentation>> + where + T: SafeMutableAliasing, + { // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` unsafe { HostAndDeviceMutRef::new_unchecked( @@ -231,14 +217,8 @@ impl { #[allow(clippy::needless_lifetimes)] // keep 'stream explicit - // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -295,14 +275,8 @@ impl< T: RustToCudaAsync, > Async<'a, 'stream, ExchangeWrapperOnDevice, NoCompletion> { - // TODO: safety constraint? /// Moves the data asynchronously back to the host CPU device. /// - /// To avoid aliasing, each CUDA thread only got access to its own shallow - /// copy of the data. 
Hence, - /// - any shallow changes to the data will NOT be reflected back to the CPU - /// - any deep changes to the data WILL be reflected back to the CPU - /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA @@ -380,7 +354,10 @@ impl< '_, 'stream, HostAndDeviceMutRef::CudaRepresentation>>, - > { + > + where + T: SafeMutableAliasing, + { let this = unsafe { self.as_mut().unwrap_unchecked() }; AsyncProj::new(unsafe { From eeb4020cc98baad04d0a01f747efd752463541d3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 04:25:37 +0000 Subject: [PATCH 089/120] Fix no-std Box import for LendRustToCuda derive --- rust-cuda-derive/src/rust_to_cuda/impl.rs | 2 +- src/deps.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 674f5e166..40dd3487d 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -170,7 +170,7 @@ pub fn rust_to_cuda_async_trait( #crate_path::utils::r#async::Async::< _, #crate_path::utils::r#async::CompletionFnMut, >::pending( - this, stream, Box::new(|this| { + this, stream, #crate_path::deps::alloc::boxed::Box::new(|this| { #(#r2c_field_async_completion_calls)* Ok(()) }), diff --git a/src/deps.rs b/src/deps.rs index 0000f9250..50fd38f3f 100644 --- a/src/deps.rs +++ b/src/deps.rs @@ -1,4 +1,5 @@ -pub(crate) extern crate alloc; +#[doc(hidden)] +pub extern crate alloc; pub extern crate const_type_layout; From 4eaaa92afdc3e43427c58bc4125247dc49b762b7 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 05:04:47 +0000 Subject: [PATCH 090/120] Re-add RustToCuda implementation for Final --- .vscode/settings.json | 1 + Cargo.toml | 3 ++ src/lend/impls/final.rs | 102 +++++++++++++++++++++++++++++++++++++++ src/lend/impls/mod.rs | 2 + src/lend/impls/option.rs | 4 +- src/lend/mod.rs | 12 ++--- 6 files changed, 116 insertions(+), 8 
deletions(-) create mode 100644 src/lend/impls/final.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index ddfa41463..d12ff8221 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,6 +6,7 @@ "rust-analyzer.cargo.allFeatures": false, "rust-analyzer.cargo.features": [ "derive", + "final", "host", "kernel" ], diff --git a/Cargo.toml b/Cargo.toml index eb0e1725f..5aaa324bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ rust-version = "1.75" # nightly default = [] derive = ["dep:rustacuda_derive", "dep:rust-cuda-derive"] device = [] +final = ["dep:final"] host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"] kernel = ["dep:rust-cuda-kernel"] @@ -37,5 +38,7 @@ const-type-layout = { version = "0.2.1", features = ["derive"] } safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } +final = { version = "0.1.1", optional = true } + rust-cuda-derive = { path = "rust-cuda-derive", optional = true } rust-cuda-kernel = { path = "rust-cuda-kernel", optional = true } diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs new file mode 100644 index 000000000..6235a58fe --- /dev/null +++ b/src/lend/impls/final.rs @@ -0,0 +1,102 @@ +use r#final::Final; + +use crate::{ + lend::{CudaAsRust, RustToCuda, RustToCudaAsync}, + utils::ffi::DeviceAccessible, +}; + +#[doc(hidden)] +#[allow(clippy::module_name_repetitions)] +#[derive(const_type_layout::TypeLayout)] +#[repr(transparent)] +pub struct FinalCudaRepresentation(DeviceAccessible); + +unsafe impl RustToCuda for Final { + type CudaAllocation = T::CudaAllocation; + type CudaRepresentation = FinalCudaRepresentation; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::alloc::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow(alloc)?; + + Ok(( + 
DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore( + &mut self, + alloc: crate::alloc::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail) = alloc.split(); + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync for Final { + type CudaAllocationAsync = T::CudaAllocationAsync; + + #[cfg(feature = "host")] + unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + crate::alloc::CombinedCudaAlloc, + )> { + let (cuda_repr, alloc) = (**self).borrow_async(alloc, stream)?; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + + let final_cuda_repr = DeviceAccessible::from(FinalCudaRepresentation(cuda_repr)); + + let r#async = if matches!(completion, Some(crate::utils::r#async::NoCompletion)) { + crate::utils::r#async::Async::pending( + final_cuda_repr, + stream, + crate::utils::r#async::NoCompletion, + )? 
+ } else { + crate::utils::r#async::Async::ready(final_cuda_repr, stream) + }; + + Ok((r#async, alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: crate::alloc::CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail) = alloc.split(); + let r#async = crate::utils::r#async::Async::ready(this, stream); + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust for FinalCudaRepresentation { + type RustRepresentation = Final; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + Final::new(CudaAsRust::as_rust(&this.0)) + } +} diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs index 18f546bbd..e0360671c 100644 --- a/src/lend/impls/mod.rs +++ b/src/lend/impls/mod.rs @@ -1,5 +1,7 @@ mod r#box; mod boxed_slice; +#[cfg(feature = "final")] +mod r#final; mod option; mod r#ref; mod ref_mut; diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 197906baf..b1c51b9a5 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -108,7 +108,7 @@ unsafe impl RustToCudaAsync for Option { Some(value) => { let (cuda_repr, alloc) = value.borrow_async(alloc, stream)?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; let (alloc_front, alloc_tail) = alloc.split(); let alloc = CombinedCudaAlloc::new(Some(alloc_front), alloc_tail); @@ -118,7 +118,7 @@ unsafe impl RustToCudaAsync for Option { present: true, }); - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(option_cuda_repr, stream, NoCompletion)? } else { Async::ready(option_cuda_repr, stream) diff --git a/src/lend/mod.rs b/src/lend/mod.rs index a78cd4018..b3f83ecff 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -428,10 +428,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceConstRef::with_new(&cuda_repr, |const_ref| { - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(const_ref, stream, NoCompletion)? } else { Async::ready(const_ref, stream) @@ -476,10 +476,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { this.borrow_async(NoCudaAlloc, stream) }?; - let (mut cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? }; + let (mut cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceMutRef::with_new(&mut cuda_repr, |mut_ref| { - let r#async = if matches!(capture_on_completion, Some(NoCompletion)) { + let r#async = if matches!(completion, Some(NoCompletion)) { Async::pending(mut_ref, stream, NoCompletion)? } else { Async::ready(mut_ref, stream) @@ -517,10 +517,10 @@ impl LendToCudaAsync for T { { let (cuda_repr, alloc) = unsafe { self.borrow_async(NoCudaAlloc, stream) }?; - let (cuda_repr, capture_on_completion) = unsafe { cuda_repr.unwrap_unchecked()? 
}; + let (cuda_repr, completion) = unsafe { cuda_repr.unwrap_unchecked()? }; let result = HostAndDeviceOwned::with_new(cuda_repr, |owned_ref| { - if matches!(capture_on_completion, Some(NoCompletion)) { + if matches!(completion, Some(NoCompletion)) { inner(Async::pending(owned_ref, stream, NoCompletion)?) } else { inner(Async::ready(owned_ref, stream)) From fc18c7908f94ebc1e76ba5b722ffe7118b618035 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 05:30:17 +0000 Subject: [PATCH 091/120] Remove redundant RustToCudaAsyncProxy --- rust-cuda-derive/src/rust_to_cuda/field_copy.rs | 6 +++--- src/lend/mod.rs | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index c32ac67ee..05d133156 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -158,7 +158,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( r2c_field_async_declarations.push(quote! 
{ let (#field_repr_ident, alloc_front) = #crate_path::lend::RustToCudaAsync::borrow_async( < - #proxy_ty as #crate_path::lend::RustToCudaAsyncProxy<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_ref(&self.#field_accessor), alloc_front, stream, @@ -184,7 +184,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( }; let (r#async, alloc_front) = #crate_path::lend::RustToCudaAsync::restore_async( this.map_mut(|this| < - #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut this.#field_accessor)), alloc_front, stream, @@ -199,7 +199,7 @@ pub fn impl_field_copy_init_and_expand_alloc_type( #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, _> >::complete( #field_completion_ident, < - #proxy_ty as #crate_path::lend::RustToCudaProxyAsync<#field_ty> + #proxy_ty as #crate_path::lend::RustToCudaProxy<#field_ty> >::from_mut(&mut this.#field_accessor), )?; }); diff --git a/src/lend/mod.rs b/src/lend/mod.rs index b3f83ecff..7a3934aa0 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -155,10 +155,6 @@ pub trait RustToCudaProxy: RustToCuda { fn into(self) -> T; } -pub trait RustToCudaAsyncProxy: RustToCudaAsync + RustToCudaProxy {} - -impl> RustToCudaAsyncProxy for P {} - #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] pub trait LendToCuda: RustToCuda { From abaa2598fc7ecd53a6a4538a55aad6931c910d8a Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 20:22:43 +0000 Subject: [PATCH 092/120] More progress on less 'static bounds on kernel params --- src/kernel/mod.rs | 62 +- src/kernel/param.rs | 1562 +++++++++++++++++++++---------------------- 2 files changed, 823 insertions(+), 801 deletions(-) diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 7026efc1a..cc51d64f0 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -39,52 +39,76 @@ mod sealed { pub struct Token; } +#[cfg(feature = "host")] +pub trait 
WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; +} + +#[cfg(feature = "host")] +impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { + (self)(param) + } +} + +#[cfg(feature = "device")] +pub trait WithFfiAsDevice { + fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b; +} + +#[cfg(feature = "device")] +impl FnOnce(P::DeviceType<'b>) -> O> WithFfiAsDevice for F { + fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b { + (self)(param) + } +} + pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type SyncHostType; #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b>; + type AsyncHostType<'stream, 'b> where Self: 'b; #[doc(hidden)] - type FfiType<'stream, 'b>: PortableBitSemantics; + type FfiType<'stream, 'b>: PortableBitSemantics where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b>; + type DeviceType<'b> where Self: 'b; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result; + inner: impl WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param; #[doc(hidden)] #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O; + ) -> O where Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] - fn shared_layout_for_async( - param: 
&Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> std::alloc::Layout; + ) -> std::alloc::Layout where Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E>; + ) -> Result, E> where Self: 'b; #[doc(hidden)] #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O; + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl WithFfiAsDevice, + ) -> O where Self: 'short; } #[cfg(feature = "host")] @@ -151,7 +175,7 @@ macro_rules! impl_launcher_launch { $inner }; (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { impl_launcher_launch! { impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; @@ -395,7 +419,7 @@ macro_rules! impl_typed_kernel_launch { $inner }; (impl $func:ident ($arg0:ident : $T0:ident $(, $arg:ident : $T:ident)*) + ($($other:expr),*) $inner:block) => { - $T0::$func($arg0 $(, $other)*, |$arg0| { + $T0::$func($arg0 $(, $other)*, |$arg0: <$T0 as CudaKernelParameter>::AsyncHostType<'stream, '_>| { impl_typed_kernel_launch! 
{ impl $func ($($arg: $T),*) + ($($other),*) $inner } }) }; diff --git a/src/kernel/param.rs b/src/kernel/param.rs index f28944b81..edc56f4b7 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -72,36 +72,36 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; + crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics; + type DeviceType<'b> = T where Self: 'b; + type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; #[cfg(feature = "host")] type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -109,18 +109,18 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: 
Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { Ok(param) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { let param = param.into_inner(); - inner(param) + inner.with(param) } } impl< @@ -135,8 +135,7 @@ impl< impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -147,38 +146,38 @@ impl< 'b, 'stream, &'b crate::host::HostAndDeviceConstRef<'b, T>, - >; + > where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b; #[cfg(feature = "host")] type SyncHostType = &'a T; #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner(const_ref.as_async(stream).as_ref()) + inner.with(const_ref.as_async(stream).as_ref()) }) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { inner(None) } #[cfg(feature = "host")] - fn shared_layout_for_async( 
- _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -186,25 +185,24 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { let param = param.as_ref(); - inner(param) + inner.with(param) } } impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -214,8 +212,7 @@ impl< impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -223,38 +220,40 @@ impl< { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b>; + <&'a PerThreadShallowCopy as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b>; + type DeviceType<'b> = <&'a PerThreadShallowCopy as CudaKernelParameter>::DeviceType<'b> where Self: 'b; type FfiType<'stream, 'b> = - <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b>; + <&'a PerThreadShallowCopy as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; #[cfg(feature = "host")] type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn 
with_new_async<'stream, O, E: From>( + fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_new_async(param, stream, inner) + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { + inner.with(const_ref.as_async(stream).as_ref()) + }) } #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { + ) -> O where Self: 'b { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout { + ) -> Layout where Self: 'b { Layout::new::<()>() } @@ -262,26 +261,25 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> { + ) -> Result, E> where Self: 'b { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - <&'a PerThreadShallowCopy as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) + let param = 
param.as_ref(); + + inner.with(param) } } impl< 'a, - T: 'static - + Sync + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, @@ -289,573 +287,573 @@ impl< { } -pub struct ShallowInteriorMutable< - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, -> { - never: !, - _marker: PhantomData, -} - -impl< - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > Deref for ShallowInteriorMutable -{ - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - 'a, - T: 'static - + Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > CudaKernelParameter for &'a ShallowInteriorMutable -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T>, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; - #[cfg(feature = "host")] - /// The kernel takes a mutable borrow of the interior mutable data to ensure - /// the interior mutability is limited to just this kernel invocation. 
- type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner(const_ref.as_ref().as_async(stream).as_ref()) - }) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let param = unsafe { param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let param = param.as_ref(); - - inner(param) - } -} -impl< - 'a, - T: crate::safety::StackOnly - + Sync - + crate::safety::PortableBitSemantics - + TypeGraphLayout - + InteriorMutableSync, - > sealed::Sealed for &'a ShallowInteriorMutable -{ -} - -pub trait InteriorMutableSync: Sync + sealed::Sealed {} - -macro_rules! impl_atomic_interior_mutable { - ($atomic:ident($interior:ty)) => { - impl InteriorMutableSync for core::sync::atomic::$atomic {} - impl sealed::Sealed for core::sync::atomic::$atomic {} - }; - ($($atomic:ident($interior:ty)),*) => { - $(impl_atomic_interior_mutable! { $atomic($interior) })* - } -} - -impl_atomic_interior_mutable! 
{ - AtomicBool(bool), - AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), - AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -} - -impl InteriorMutableSync - for core::cell::SyncUnsafeCell -{ -} -impl sealed::Sealed - for core::cell::SyncUnsafeCell -{ -} - -pub struct DeepPerThreadBorrow { - never: !, - _marker: PhantomData, -} - -impl Deref for DeepPerThreadBorrow { - type Target = T; - - fn deref(&self) -> &Self::Target { - self.never - } -} - -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > CudaKernelParameter for DeepPerThreadBorrow -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< - 'b, - 'stream, - crate::host::HostAndDeviceOwned< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - crate::utils::r#async::NoCompletion, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = T; - type FfiType<'stream, 'b> = - DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::move_to_cuda(param, |param| inner(param.into_async(stream))) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: 
sealed::Token, - ) -> Result, E> { - let (param, _completion): (_, Option) = - unsafe { param.unwrap_unchecked()? }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } - } -} -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > sealed::Sealed for DeepPerThreadBorrow -{ -} - -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b crate::host::HostAndDeviceConstRef< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b T; - type FfiType<'stream, 'b> = - DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner(param.as_async(stream).as_ref())) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let 
param = unsafe { param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } - } -} -impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} - -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter - for &'a mut DeepPerThreadBorrow -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< - 'b, - 'stream, - &'b mut crate::host::HostAndDeviceMutRef< - 'b, - DeviceAccessible<::CudaRepresentation>, - >, - >; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut T; - type FfiType<'stream, 'b> = - DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; - #[cfg(feature = "host")] - type SyncHostType = &'a mut T; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { - // FIXME: express the same with param.as_async(stream).as_mut() - let _ = stream; - inner(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) - }) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - let param = unsafe { 
param.unwrap_unchecked() }; - Ok(param.for_device()) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } - } -} -impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed - for &'a mut DeepPerThreadBorrow -{ -} - -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > CudaKernelParameter for PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.as_ref().unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, E> { - as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - 
#[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl< - T: Send - + Clone - + RustToCuda< - CudaRepresentation: 'static + crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, - > sealed::Sealed for PtxJit> -{ -} - -impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, inner) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, 
E> { - <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} - -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter - for &'a mut PtxJit> -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; - type FfiType<'stream, 'b> = - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; - #[cfg(feature = "host")] - type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( - param, stream, inner, - ) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - let param = unsafe { param.as_ref().unwrap_unchecked() }; - inner(Some(¶m_as_raw_bytes(param.for_host()))) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - 
param: Self::AsyncHostType<'stream, 'b>, - token: sealed::Token, - ) -> Result, E> { - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) - } - - #[cfg(feature = "device")] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - - <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( - param, inner, - ) - } -} -impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed - for &'a mut PtxJit> -{ -} +// pub struct ShallowInteriorMutable< +// T: Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > { +// never: !, +// _marker: PhantomData, +// } + +// impl< +// T: Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > Deref for ShallowInteriorMutable +// { +// type Target = T; + +// fn deref(&self) -> &Self::Target { +// self.never +// } +// } + +// impl< +// 'a, +// T: 'static +// + Sync +// + crate::safety::StackOnly +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > CudaKernelParameter for &'a ShallowInteriorMutable +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b crate::host::HostAndDeviceConstRef<'b, T>, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b T; +// type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; +// #[cfg(feature = "host")] +// /// The kernel takes a mutable borrow of the interior mutable data to ensure +// /// the interior mutability is limited to just this kernel invocation. 
+// type SyncHostType = &'a mut T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { +// inner.with(const_ref.as_ref().as_async(stream).as_ref()) +// }) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// let param = param.as_ref(); + +// inner(param) +// } +// } +// impl< +// 'a, +// T: crate::safety::StackOnly +// + Sync +// + crate::safety::PortableBitSemantics +// + TypeGraphLayout +// + InteriorMutableSync, +// > sealed::Sealed for &'a ShallowInteriorMutable +// { +// } + +// pub trait InteriorMutableSync: Sync + sealed::Sealed {} + +// macro_rules! impl_atomic_interior_mutable { +// ($atomic:ident($interior:ty)) => { +// impl InteriorMutableSync for core::sync::atomic::$atomic {} +// impl sealed::Sealed for core::sync::atomic::$atomic {} +// }; +// ($($atomic:ident($interior:ty)),*) => { +// $(impl_atomic_interior_mutable! { $atomic($interior) })* +// } +// } + +// impl_atomic_interior_mutable! 
{ +// AtomicBool(bool), +// AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), +// AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +// } + +// impl InteriorMutableSync +// for core::cell::SyncUnsafeCell +// { +// } +// impl sealed::Sealed +// for core::cell::SyncUnsafeCell +// { +// } + +// pub struct DeepPerThreadBorrow { +// never: !, +// _marker: PhantomData, +// } + +// impl Deref for DeepPerThreadBorrow { +// type Target = T; + +// fn deref(&self) -> &Self::Target { +// self.never +// } +// } + +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > CudaKernelParameter for DeepPerThreadBorrow +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< +// 'b, +// 'stream, +// crate::host::HostAndDeviceOwned< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// crate::utils::r#async::NoCompletion, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = T; +// type FfiType<'stream, 'b> = +// DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout 
{ +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let (param, _completion): (_, Option) = +// unsafe { param.unwrap_unchecked()? }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } +// } +// } +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > sealed::Sealed for DeepPerThreadBorrow +// { +// } + +// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b crate::host::HostAndDeviceConstRef< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b T; +// type FfiType<'stream, 'b> = +// DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = &'a T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn 
shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } +// } +// } +// impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter +// for &'a mut DeepPerThreadBorrow +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< +// 'b, +// 'stream, +// &'b mut crate::host::HostAndDeviceMutRef< +// 'b, +// DeviceAccessible<::CudaRepresentation>, +// >, +// >; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut T; +// type FfiType<'stream, 'b> = +// DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; +// #[cfg(feature = "host")] +// type SyncHostType = &'a mut T; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { +// // FIXME: express the same with param.as_async(stream).as_mut() +// let _ = stream; +// inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) +// }) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl 
for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// let param = unsafe { param.unwrap_unchecked() }; +// Ok(param.for_device()) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } +// } +// } +// impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed +// for &'a mut DeepPerThreadBorrow +// { +// } + +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > CudaKernelParameter for PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type SyncHostType = as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// as CudaKernelParameter>::with_new_async(param, stream, |param: as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: 
&Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.as_ref().unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl< +// T: Send +// + Clone +// + RustToCuda< +// CudaRepresentation: 'static + crate::safety::StackOnly, +// CudaAllocation: EmptyCudaAlloc, +// >, +// > sealed::Sealed for PtxJit> +// { +// } + +// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter +// for &'a PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, 
Self, O, E>, +// ) -> Result where Self: 'param { +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, |param: <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter +// for &'a mut PtxJit> +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; +// type FfiType<'stream, 'b> = +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; +// #[cfg(feature = "host")] +// type 
SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( +// param, stream, |param: <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param), +// ) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// let param = unsafe { param.as_ref().unwrap_unchecked() }; +// inner(Some(¶m_as_raw_bytes(param.for_host()))) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// token: sealed::Token, +// ) -> Result, E> { +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) +// } + +// #[cfg(feature = "device")] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + +// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( +// param, inner, +// ) +// } +// } +// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed +// for &'a mut PtxJit> +// { +// } #[cfg(feature = "host")] fn param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -873,151 +871,151 @@ fn emit_param_ptx_jit_marker(param: &T) { } } -mod private_shared { - use core::marker::PhantomData; - - 
use const_type_layout::{TypeGraphLayout, TypeLayout}; - - use crate::safety::PortableBitSemantics; - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedFfi { - pub(super) _dummy: [u8; 0], - pub(super) _marker: PhantomData, - } - - #[doc(hidden)] - #[derive(TypeLayout)] - #[repr(C)] - pub struct ThreadBlockSharedSliceFfi { - pub(super) len: usize, - pub(super) _marker: [T; 0], - } -} - -impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - Layout::new::<()>() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - _param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - Ok(private_shared::ThreadBlockSharedFfi { - _dummy: [], - _marker: PhantomData::, - }) - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - _param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - let mut param = 
crate::utils::shared::ThreadBlockShared::new_uninit(); - - inner(&mut param) - } -} -impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} - -impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter - for &'a mut crate::utils::shared::ThreadBlockSharedSlice -{ - #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; - #[cfg(any(feature = "device", doc))] - type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; - type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; - #[cfg(feature = "host")] - type SyncHostType = Self; - - #[cfg(feature = "host")] - fn with_new_async<'stream, O, E: From>( - param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, - inner: impl for<'b> FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, - ) -> Result { - inner(param) - } - - #[cfg(feature = "host")] - fn with_async_as_ptx_jit( - _param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O { - inner(None) - } - - #[cfg(feature = "host")] - fn shared_layout_for_async( - param: &Self::AsyncHostType<'_, '_>, - _token: sealed::Token, - ) -> Layout { - param.layout() - } - - #[cfg(feature = "host")] - fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, - _token: sealed::Token, - ) -> Result, E> { - Ok(private_shared::ThreadBlockSharedSliceFfi { - len: param.len(), - _marker: [], - }) - } - - #[cfg(feature = "device")] - #[allow(clippy::inline_always)] - #[inline(always)] - unsafe fn with_ffi_as_device( - param: Self::FfiType<'static, 'static>, - inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, - ) -> O { - unsafe { - crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) - } - } -} -impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed - for &'a mut 
crate::utils::shared::ThreadBlockSharedSlice -{ -} +// mod private_shared { +// use core::marker::PhantomData; + +// use const_type_layout::{TypeGraphLayout, TypeLayout}; + +// use crate::safety::PortableBitSemantics; + +// #[doc(hidden)] +// #[derive(TypeLayout)] +// #[repr(C)] +// pub struct ThreadBlockSharedFfi { +// pub(super) _dummy: [u8; 0], +// pub(super) _marker: PhantomData, +// } + +// #[doc(hidden)] +// #[derive(TypeLayout)] +// #[repr(C)] +// pub struct ThreadBlockSharedSliceFfi { +// pub(super) len: usize, +// pub(super) _marker: [T; 0], +// } +// } + +// impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; +// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; +// #[cfg(feature = "host")] +// type SyncHostType = Self; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// _stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// inner.with(param) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// Layout::new::<()>() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// _param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// Ok(private_shared::ThreadBlockSharedFfi { +// _dummy: [], +// _marker: PhantomData::, +// }) +// } + 
+// #[cfg(feature = "device")] +// #[allow(clippy::inline_always)] +// #[inline(always)] +// unsafe fn with_ffi_as_device( +// _param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + +// inner(&mut param) +// } +// } +// impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} + +// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter +// for &'a mut crate::utils::shared::ThreadBlockSharedSlice +// { +// #[cfg(feature = "host")] +// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; +// #[cfg(any(feature = "device", doc))] +// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; +// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; +// #[cfg(feature = "host")] +// type SyncHostType = Self; + +// #[cfg(feature = "host")] +// fn with_new_async<'stream, 'param, O, E: From>( +// param: Self::SyncHostType, +// _stream: &'stream rustacuda::stream::Stream, +// inner: impl super::WithNewAsync<'stream, Self, O, E>, +// ) -> Result where Self: 'param { +// inner.with(param) +// } + +// #[cfg(feature = "host")] +// fn with_async_as_ptx_jit( +// _param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, +// ) -> O { +// inner(None) +// } + +// #[cfg(feature = "host")] +// fn shared_layout_for_async( +// param: &Self::AsyncHostType<'_, '_>, +// _token: sealed::Token, +// ) -> Layout { +// param.layout() +// } + +// #[cfg(feature = "host")] +// fn async_to_ffi<'stream, 'b, E: From>( +// param: Self::AsyncHostType<'stream, 'b>, +// _token: sealed::Token, +// ) -> Result, E> { +// Ok(private_shared::ThreadBlockSharedSliceFfi { +// len: param.len(), +// _marker: [], +// }) +// } + +// #[cfg(feature = "device")] +// 
#[allow(clippy::inline_always)] +// #[inline(always)] +// unsafe fn with_ffi_as_device( +// param: Self::FfiType<'static, 'static>, +// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, +// ) -> O { +// unsafe { +// crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) +// } +// } +// } +// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed +// for &'a mut crate::utils::shared::ThreadBlockSharedSlice +// { +// } From e0d2319c54bf79ccb5e6a1bf3a1390d4b5dcebf7 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 7 Jan 2024 22:01:01 +0000 Subject: [PATCH 093/120] Further investigation of less 'static bounds --- src/kernel/mod.rs | 5 +- src/kernel/param.rs | 1433 ++++++++++++++++++++++--------------------- 2 files changed, 721 insertions(+), 717 deletions(-) diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index cc51d64f0..c03ca5517 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -39,12 +39,13 @@ mod sealed { pub struct Token; } -#[cfg(feature = "host")] +#[cfg(feature = "host")] // FIXME: make private? pub trait WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { + #[allow(clippy::missing_errors_doc)] // FIXME fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; } -#[cfg(feature = "host")] +#[cfg(feature = "host")] // FIXME: make private? 
impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { (self)(param) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index edc56f4b7..c40f68e1e 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -234,6 +234,7 @@ impl< stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'param { + // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) }) @@ -272,6 +273,7 @@ impl< ) -> O where Self: 'short { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + // FIXME: forward impl let param = param.as_ref(); inner.with(param) @@ -287,573 +289,574 @@ impl< { } -// pub struct ShallowInteriorMutable< -// T: Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > { -// never: !, -// _marker: PhantomData, -// } - -// impl< -// T: Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > Deref for ShallowInteriorMutable -// { -// type Target = T; - -// fn deref(&self) -> &Self::Target { -// self.never -// } -// } - -// impl< -// 'a, -// T: 'static -// + Sync -// + crate::safety::StackOnly -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > CudaKernelParameter for &'a ShallowInteriorMutable -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b crate::host::HostAndDeviceConstRef<'b, T>, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b T; -// type FfiType<'stream, 'b> = DeviceConstRef<'b, T>; -// #[cfg(feature = "host")] -// /// The 
kernel takes a mutable borrow of the interior mutable data to ensure -// /// the interior mutability is limited to just this kernel invocation. -// type SyncHostType = &'a mut T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { -// inner.with(const_ref.as_ref().as_async(stream).as_ref()) -// }) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// let param = param.as_ref(); - -// inner(param) -// } -// } -// impl< -// 'a, -// T: crate::safety::StackOnly -// + Sync -// + crate::safety::PortableBitSemantics -// + TypeGraphLayout -// + InteriorMutableSync, -// > sealed::Sealed for &'a ShallowInteriorMutable -// { -// } - -// pub trait InteriorMutableSync: Sync + sealed::Sealed {} - -// macro_rules! 
impl_atomic_interior_mutable { -// ($atomic:ident($interior:ty)) => { -// impl InteriorMutableSync for core::sync::atomic::$atomic {} -// impl sealed::Sealed for core::sync::atomic::$atomic {} -// }; -// ($($atomic:ident($interior:ty)),*) => { -// $(impl_atomic_interior_mutable! { $atomic($interior) })* -// } -// } - -// impl_atomic_interior_mutable! { -// AtomicBool(bool), -// AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), -// AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) -// } - -// impl InteriorMutableSync -// for core::cell::SyncUnsafeCell -// { -// } -// impl sealed::Sealed -// for core::cell::SyncUnsafeCell -// { -// } - -// pub struct DeepPerThreadBorrow { -// never: !, -// _marker: PhantomData, -// } - -// impl Deref for DeepPerThreadBorrow { -// type Target = T; - -// fn deref(&self) -> &Self::Target { -// self.never -// } -// } - -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > CudaKernelParameter for DeepPerThreadBorrow -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< -// 'b, -// 'stream, -// crate::host::HostAndDeviceOwned< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// crate::utils::r#async::NoCompletion, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = T; -// type FfiType<'stream, 'b> = -// DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) -// } - -// #[cfg(feature = 
"host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let (param, _completion): (_, Option) = -// unsafe { param.unwrap_unchecked()? }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, inner) } -// } -// } -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > sealed::Sealed for DeepPerThreadBorrow -// { -// } - -// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b crate::host::HostAndDeviceConstRef< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b T; -// type FfiType<'stream, 'b> = -// DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = &'a T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// 
crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, inner) } -// } -// } -// impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} - -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter -// for &'a mut DeepPerThreadBorrow -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< -// 'b, -// 'stream, -// &'b mut crate::host::HostAndDeviceMutRef< -// 'b, -// DeviceAccessible<::CudaRepresentation>, -// >, -// >; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut T; -// type FfiType<'stream, 'b> = -// DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>>; -// #[cfg(feature = "host")] -// type SyncHostType = &'a mut T; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// 
crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { -// // FIXME: express the same with param.as_async(stream).as_mut() -// let _ = stream; -// inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) -// }) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// let param = unsafe { param.unwrap_unchecked() }; -// Ok(param.for_device()) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, inner) } -// } -// } -// impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed -// for &'a mut DeepPerThreadBorrow -// { -// } - -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > CudaKernelParameter for PtxJit> -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = -// as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// 
param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// as CudaKernelParameter>::with_new_async(param, stream, |param: as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.as_ref().unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// as CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl< -// T: Send -// + Clone -// + RustToCuda< -// CudaRepresentation: 'static + crate::safety::StackOnly, -// CudaAllocation: EmptyCudaAlloc, -// >, -// > sealed::Sealed for PtxJit> -// { -// } - -// impl<'a, T: 'static + Sync + RustToCuda> CudaKernelParameter -// for &'a PtxJit> -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// <&'a 
DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_new_async(param, stream, |param: <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param)) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// <&'a DeepPerThreadBorrow as CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl<'a, T: 'static + Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} - -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter -// for &'a mut PtxJit> -// { -// #[cfg(feature = "host")] -// type 
AsyncHostType<'stream, 'b> = -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b>; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b>; -// type FfiType<'stream, 'b> = -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b>; -// #[cfg(feature = "host")] -// type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::with_new_async( -// param, stream, |param: <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, '_>| inner.with(param), -// ) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// let param = unsafe { param.as_ref().unwrap_unchecked() }; -// inner(Some(¶m_as_raw_bytes(param.for_host()))) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// token: sealed::Token, -// ) -> Result, E> { -// <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) -// } - -// #[cfg(feature = "device")] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); - -// <&'a mut DeepPerThreadBorrow as 
CudaKernelParameter>::with_ffi_as_device::( -// param, inner, -// ) -// } -// } -// impl<'a, T: 'static + Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed -// for &'a mut PtxJit> -// { -// } +pub struct ShallowInteriorMutable< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, +> { + never: !, + _marker: PhantomData, +} + +impl< + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > Deref for ShallowInteriorMutable +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + 'a, + T: Sync + + crate::safety::StackOnly + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > CudaKernelParameter for &'a ShallowInteriorMutable +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b crate::host::HostAndDeviceConstRef<'b, T> + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = DeviceConstRef<'b, T> where Self: 'b; + #[cfg(feature = "host")] + /// The kernel takes a mutable borrow of the interior mutable data to ensure + /// the interior mutability is limited to just this kernel invocation. 
+ type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { + inner.with(const_ref.as_ref().as_async(stream).as_ref()) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + let param = param.as_ref(); + + inner.with(param) + } +} +impl< + 'a, + T: crate::safety::StackOnly + + Sync + + crate::safety::PortableBitSemantics + + TypeGraphLayout + + InteriorMutableSync, + > sealed::Sealed for &'a ShallowInteriorMutable +{ +} + +pub trait InteriorMutableSync: Sync + sealed::Sealed {} + +macro_rules! impl_atomic_interior_mutable { + ($atomic:ident($interior:ty)) => { + impl InteriorMutableSync for core::sync::atomic::$atomic {} + impl sealed::Sealed for core::sync::atomic::$atomic {} + }; + ($($atomic:ident($interior:ty)),*) => { + $(impl_atomic_interior_mutable! { $atomic($interior) })* + } +} + +impl_atomic_interior_mutable! 
{ + AtomicBool(bool), + AtomicI8(i8), AtomicI16(i16), AtomicI32(i32), AtomicI64(i64), AtomicIsize(isize), + AtomicU8(u8), AtomicU16(u16), AtomicU32(u32), AtomicU64(u64), AtomicUsize(usize) +} + +impl InteriorMutableSync + for core::cell::SyncUnsafeCell +{ +} +impl sealed::Sealed + for core::cell::SyncUnsafeCell +{ +} + +pub struct DeepPerThreadBorrow { + never: !, + _marker: PhantomData, +} + +impl Deref for DeepPerThreadBorrow { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.never + } +} + +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::Async< + 'b, + 'stream, + crate::host::HostAndDeviceOwned< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + crate::utils::r#async::NoCompletion, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceOwnedRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature 
= "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let (param, _completion): (_, Option) = + unsafe { param.unwrap_unchecked()? }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + } +} +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for DeepPerThreadBorrow +{ +} + +impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b crate::host::HostAndDeviceConstRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceConstRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = &'a T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: 
&Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} + +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut DeepPerThreadBorrow +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< + 'b, + 'stream, + &'b mut crate::host::HostAndDeviceMutRef< + 'b, + DeviceAccessible<::CudaRepresentation>, + >, + > where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut T where Self: 'b; + type FfiType<'stream, 'b> = + DeviceMutRef<'b, DeviceAccessible<::CudaRepresentation>> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = &'a mut T; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> 
FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + Ok(param.for_device()) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut DeepPerThreadBorrow +{ +} + +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > CudaKernelParameter for PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, 
O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.as_ref().unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + } +} +impl< + T: Send + + Clone + + RustToCuda< + CudaRepresentation: crate::safety::StackOnly, + CudaAllocation: EmptyCudaAlloc, + >, + > sealed::Sealed for PtxJit> +{ +} + +impl<'a, T: Sync + RustToCuda> CudaKernelParameter + for &'a PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + <&'a DeepPerThreadBorrow as CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream 
rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} + +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter + for &'a mut PtxJit> +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::DeviceType<'b> where Self: 'b; + type FfiType<'stream, 'b> = + <&'a mut DeepPerThreadBorrow as 
CudaKernelParameter>::FfiType<'stream, 'b> where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + // FIXME: forward impl + crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + // FIXME: express the same with param.as_async(stream).as_mut() + let _ = stream; + inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + }) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) -> O where Self: 'b { + let param = unsafe { param.as_ref().unwrap_unchecked() }; + inner(Some(¶m_as_raw_bytes(param.for_host()))) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + token: sealed::Token, + ) -> Result, E> where Self: 'b { + <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) + } + + #[cfg(feature = "device")] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); + + // FIXME: forward impl + unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + } +} +impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed + for &'a mut PtxJit> +{ +} #[cfg(feature = "host")] fn 
param_as_raw_bytes(r: &T) -> NonNull<[u8]> { @@ -871,151 +874,151 @@ fn emit_param_ptx_jit_marker(param: &T) { } } -// mod private_shared { -// use core::marker::PhantomData; - -// use const_type_layout::{TypeGraphLayout, TypeLayout}; - -// use crate::safety::PortableBitSemantics; - -// #[doc(hidden)] -// #[derive(TypeLayout)] -// #[repr(C)] -// pub struct ThreadBlockSharedFfi { -// pub(super) _dummy: [u8; 0], -// pub(super) _marker: PhantomData, -// } - -// #[doc(hidden)] -// #[derive(TypeLayout)] -// #[repr(C)] -// pub struct ThreadBlockSharedSliceFfi { -// pub(super) len: usize, -// pub(super) _marker: [T; 0], -// } -// } - -// impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared; -// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi; -// #[cfg(feature = "host")] -// type SyncHostType = Self; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// _stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// inner.with(param) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// Layout::new::<()>() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// _param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// 
Ok(private_shared::ThreadBlockSharedFfi { -// _dummy: [], -// _marker: PhantomData::, -// }) -// } - -// #[cfg(feature = "device")] -// #[allow(clippy::inline_always)] -// #[inline(always)] -// unsafe fn with_ffi_as_device( -// _param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); - -// inner(&mut param) -// } -// } -// impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} - -// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter -// for &'a mut crate::utils::shared::ThreadBlockSharedSlice -// { -// #[cfg(feature = "host")] -// type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; -// #[cfg(any(feature = "device", doc))] -// type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice; -// type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi; -// #[cfg(feature = "host")] -// type SyncHostType = Self; - -// #[cfg(feature = "host")] -// fn with_new_async<'stream, 'param, O, E: From>( -// param: Self::SyncHostType, -// _stream: &'stream rustacuda::stream::Stream, -// inner: impl super::WithNewAsync<'stream, Self, O, E>, -// ) -> Result where Self: 'param { -// inner.with(param) -// } - -// #[cfg(feature = "host")] -// fn with_async_as_ptx_jit( -// _param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, -// ) -> O { -// inner(None) -// } - -// #[cfg(feature = "host")] -// fn shared_layout_for_async( -// param: &Self::AsyncHostType<'_, '_>, -// _token: sealed::Token, -// ) -> Layout { -// param.layout() -// } - -// #[cfg(feature = "host")] -// fn async_to_ffi<'stream, 'b, E: From>( -// param: Self::AsyncHostType<'stream, 'b>, -// _token: sealed::Token, -// ) -> Result, E> { -// Ok(private_shared::ThreadBlockSharedSliceFfi { -// len: 
param.len(), -// _marker: [], -// }) -// } - -// #[cfg(feature = "device")] -// #[allow(clippy::inline_always)] -// #[inline(always)] -// unsafe fn with_ffi_as_device( -// param: Self::FfiType<'static, 'static>, -// inner: impl for<'b> FnOnce(Self::DeviceType<'b>) -> O, -// ) -> O { -// unsafe { -// crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, inner) -// } -// } -// } -// impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed -// for &'a mut crate::utils::shared::ThreadBlockSharedSlice -// { -// } +mod private_shared { + use core::marker::PhantomData; + + use const_type_layout::{TypeGraphLayout, TypeLayout}; + + use crate::safety::PortableBitSemantics; + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedFfi { + pub(super) _dummy: [u8; 0], + pub(super) _marker: PhantomData, + } + + #[doc(hidden)] + #[derive(TypeLayout)] + #[repr(C)] + pub struct ThreadBlockSharedSliceFfi { + pub(super) len: usize, + pub(super) _marker: [T; 0], + } +} + +impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared { + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockShared where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockShared where Self: 'b; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedFfi where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, 
+ ) -> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + Layout::new::<()>() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + _param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + Ok(private_shared::ThreadBlockSharedFfi { + _dummy: [], + _marker: PhantomData::, + }) + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + _param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); + + inner.with(&mut param) + } +} +impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared {} + +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ + #[cfg(feature = "host")] + type AsyncHostType<'stream, 'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice where Self: 'b; + #[cfg(any(feature = "device", doc))] + type DeviceType<'b> = &'b mut crate::utils::shared::ThreadBlockSharedSlice where Self: 'b; + type FfiType<'stream, 'b> = private_shared::ThreadBlockSharedSliceFfi where Self: 'b; + #[cfg(feature = "host")] + type SyncHostType = Self; + + #[cfg(feature = "host")] + fn with_new_async<'stream, 'param, O, E: From>( + param: Self::SyncHostType, + _stream: &'stream rustacuda::stream::Stream, + inner: impl super::WithNewAsync<'stream, Self, O, E>, + ) -> Result where Self: 'param { + inner.with(param) + } + + #[cfg(feature = "host")] + fn with_async_as_ptx_jit<'stream, 'b, O>( + _param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, + ) 
-> O where Self: 'b { + inner(None) + } + + #[cfg(feature = "host")] + fn shared_layout_for_async<'stream, 'b>( + param: &Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Layout where Self: 'b { + param.layout() + } + + #[cfg(feature = "host")] + fn async_to_ffi<'stream, 'b, E: From>( + param: Self::AsyncHostType<'stream, 'b>, + _token: sealed::Token, + ) -> Result, E> where Self: 'b { + Ok(private_shared::ThreadBlockSharedSliceFfi { + len: param.len(), + _marker: [], + }) + } + + #[cfg(feature = "device")] + #[allow(clippy::inline_always)] + #[inline(always)] + unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( + param: Self::FfiType<'static, 'short>, + inner: impl super::WithFfiAsDevice, + ) -> O where Self: 'short { + unsafe { + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| inner.with(param)) + } + } +} +impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed + for &'a mut crate::utils::shared::ThreadBlockSharedSlice +{ +} From fd08c41bae173dfe97ec55598efd4517f918e9f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 10:20:47 +0000 Subject: [PATCH 094/120] Remove 'static bounds from LendToCuda ref kernel params --- .../wrapper/generate/cuda_generic_function.rs | 57 ++- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- src/kernel/mod.rs | 71 ++- src/kernel/param.rs | 404 +++++++++++++----- 4 files changed, 397 insertions(+), 139 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 1b05df23b..4084db0ed 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -16,17 +16,60 @@ pub(in super::super) fn quote_cuda_generic_function( func_attrs: &[syn::Attribute], func_block: &syn::Block, ) -> TokenStream { + let mut generic_params = 
(*generic_params).clone(); + let kernel_func_inputs = func_inputs .iter() + .enumerate() .map( - |syn::PatType { - attrs, - ty, - pat, - colon_token, - }| { + |( + i, + syn::PatType { + attrs, + ty, + pat, + colon_token, + }, + )| { + let (ty, lt) = if let syn::Type::Reference(syn::TypeReference { + and_token, + lifetime, + mutability, + elem, + }) = &**ty + { + let lifetime = if let Some(lifetime) = lifetime { + lifetime.clone() + } else { + let lifetime = + syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); + generic_params.insert( + 0, + syn::GenericParam::Lifetime(syn::LifetimeDef { + attrs: Vec::new(), + colon_token: None, + lifetime: lifetime.clone(), + bounds: syn::punctuated::Punctuated::new(), + }), + ); + lifetime + }; + let lt = quote!(#lifetime); + ( + syn::Type::Reference(syn::TypeReference { + and_token: *and_token, + lifetime: Some(lifetime), + mutability: *mutability, + elem: elem.clone(), + }), + lt, + ) + } else { + (syn::Type::clone(ty), quote!('_)) + }; + let ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> - <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<'_> + <#ty as #crate_path::kernel::CudaKernelParameter>::DeviceType<#lt> }; syn::FnArg::Typed(syn::PatType { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index f61bb9b32..48049c5a1 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -52,7 +52,9 @@ pub(in super::super) fn quote_cuda_wrapper( < #specialised_ty as #crate_path::kernel::CudaKernelParameter >::with_ffi_as_device::<_, #i>( - #pat, |#pat| { #inner } + #pat, |#pat: < + #specialised_ty as #crate_path::kernel::CudaKernelParameter + >::DeviceType::<'_>| { #inner } ) } } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index c03ca5517..a27ed5b71 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -40,26 +40,51 @@ mod sealed { } #[cfg(feature = "host")] // FIXME: make private? -pub trait WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From> { +pub trait WithNewAsync< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, +> +{ #[allow(clippy::missing_errors_doc)] // FIXME - fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b; } #[cfg(feature = "host")] // FIXME: make private? 
-impl<'stream, P: ?Sized + CudaKernelParameter, O, E: From, F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result> WithNewAsync<'stream, P, O, E> for F { - fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b { +impl< + 'stream, + P: ?Sized + CudaKernelParameter, + O, + E: From, + F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result, + > WithNewAsync<'stream, P, O, E> for F +{ + fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result + where + P: 'b, + { (self)(param) } } #[cfg(feature = "device")] pub trait WithFfiAsDevice { - fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b; + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b; } #[cfg(feature = "device")] -impl FnOnce(P::DeviceType<'b>) -> O> WithFfiAsDevice for F { - fn with<'b>(self, param: P::DeviceType<'b>) -> O where P: 'b { +impl FnOnce(P::DeviceType<'b>) -> O> + WithFfiAsDevice for F +{ + fn with<'b>(self, param: P::DeviceType<'b>) -> O + where + P: 'b, + { (self)(param) } } @@ -68,11 +93,17 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] type SyncHostType; #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> where Self: 'b; + type AsyncHostType<'stream, 'b> + where + Self: 'b; #[doc(hidden)] - type FfiType<'stream, 'b>: PortableBitSemantics where Self: 'b; + type FfiType<'stream, 'b>: PortableBitSemantics + where + Self: 'b; #[cfg(any(feature = "device", doc))] - type DeviceType<'b> where Self: 'b; + type DeviceType<'b> + where + Self: 'b; #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME @@ -80,7 +111,9 @@ pub trait CudaKernelParameter: sealed::Sealed { param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param; + ) -> Result + where + Self: 'param; #[doc(hidden)] #[cfg(feature = "host")] @@ -88,28 +121,36 @@ pub trait CudaKernelParameter: sealed::Sealed { param: 
&Self::AsyncHostType<'stream, 'b>, token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b; + ) -> O + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn shared_layout_for_async<'stream, 'b>( param: &Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> std::alloc::Layout where Self: 'b; + ) -> std::alloc::Layout + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b; + ) -> Result, E> + where + Self: 'b; #[doc(hidden)] #[cfg(feature = "device")] unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl WithFfiAsDevice, - ) -> O where Self: 'short; + ) -> O + where + Self: 'short; } #[cfg(feature = "host")] diff --git a/src/kernel/param.rs b/src/kernel/param.rs index c40f68e1e..1f149d8b4 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -80,11 +80,14 @@ impl< type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } @@ -93,7 +96,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -101,7 +107,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -109,7 +118,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: 
Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(param) } @@ -117,7 +129,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.into_inner(); inner.with(param) @@ -135,10 +150,7 @@ impl< impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > CudaKernelParameter for &'a PerThreadShallowCopy { #[cfg(feature = "host")] @@ -154,11 +166,14 @@ impl< type SyncHostType = &'a T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) }) @@ -169,7 +184,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -177,7 +195,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -185,7 +206,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -194,7 +218,10 @@ impl< unsafe fn 
with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.as_ref(); inner.with(param) @@ -202,20 +229,14 @@ impl< } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > sealed::Sealed for &'a PerThreadShallowCopy { } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] @@ -229,11 +250,14 @@ impl< type SyncHostType = <&'a PerThreadShallowCopy as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { inner.with(const_ref.as_async(stream).as_ref()) @@ -245,7 +269,10 @@ impl< param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -254,7 +281,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -262,7 +292,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 
'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a PerThreadShallowCopy as CudaKernelParameter>::async_to_ffi(param, token) } @@ -270,7 +303,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl @@ -281,10 +317,7 @@ impl< } impl< 'a, - T: Sync - + crate::safety::StackOnly - + crate::safety::PortableBitSemantics - + TypeGraphLayout, + T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout, > sealed::Sealed for &'a PtxJit> { } @@ -339,11 +372,14 @@ impl< type SyncHostType = &'a mut T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { inner.with(const_ref.as_ref().as_async(stream).as_ref()) }) @@ -354,7 +390,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -362,7 +401,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -370,7 +412,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ 
-379,7 +424,10 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let param = param.as_ref(); inner.with(param) @@ -439,10 +487,7 @@ impl Deref for DeepPerThreadBorrow { impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > CudaKernelParameter for DeepPerThreadBorrow { #[cfg(feature = "host")] @@ -463,11 +508,14 @@ impl< type SyncHostType = T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) } @@ -476,7 +524,10 @@ impl< _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -484,7 +535,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -492,7 +546,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let (param, _completion): (_, Option) = unsafe { param.unwrap_unchecked()? 
}; Ok(param.for_device()) @@ -502,17 +559,19 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) + } } } impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > sealed::Sealed for DeepPerThreadBorrow { } @@ -535,12 +594,17 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + ) -> Result + where + Self: 'b, + { + crate::lend::LendToCuda::lend_to_cuda(param, |param| { + inner.with(param.as_async(stream).as_ref()) + }) } #[cfg(feature = "host")] @@ -548,7 +612,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -556,7 +623,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -564,7 +634,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { let param = 
unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -573,8 +646,13 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow {} @@ -599,11 +677,14 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter type SyncHostType = &'a mut T; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { // FIXME: express the same with param.as_async(stream).as_mut() let _ = stream; @@ -616,7 +697,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -624,7 +708,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -632,7 +719,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> 
Result, E> + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -641,8 +731,13 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + ) -> O + where + Self: 'short, + { + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed @@ -653,10 +748,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > CudaKernelParameter for PtxJit> { #[cfg(feature = "host")] @@ -670,11 +762,14 @@ impl< type SyncHostType = as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::lend::LendToCuda::move_to_cuda(param, |param| inner.with(param.into_async(stream))) } @@ -684,7 +779,10 @@ impl< param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -693,7 +791,10 @@ impl< fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> 
+ where + Self: 'b, + { as CudaKernelParameter>::async_to_ffi(param, token) } @@ -701,7 +802,10 @@ impl< fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -709,27 +813,27 @@ impl< unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_moved_from_rust(param, |param| inner.with(param)) + } } } impl< T: Send + Clone - + RustToCuda< - CudaRepresentation: crate::safety::StackOnly, - CudaAllocation: EmptyCudaAlloc, - >, + + RustToCuda, > sealed::Sealed for PtxJit> { } -impl<'a, T: Sync + RustToCuda> CudaKernelParameter - for &'a PtxJit> -{ +impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit> { #[cfg(feature = "host")] type AsyncHostType<'stream, 'b> = <&'a DeepPerThreadBorrow as CudaKernelParameter>::AsyncHostType<'stream, 'b> where Self: 'b; @@ -741,13 +845,18 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter type SyncHostType = <&'a DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl - crate::lend::LendToCuda::lend_to_cuda(param, |param| inner.with(param.as_async(stream).as_ref())) + crate::lend::LendToCuda::lend_to_cuda(param, |param| { + inner.with(param.as_async(stream).as_ref()) + }) } 
#[cfg(feature = "host")] @@ -755,7 +864,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -764,7 +876,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -772,7 +887,10 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } @@ -780,11 +898,16 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit> {} @@ -803,11 +926,14 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter type SyncHostType = <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::SyncHostType; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: &'stream rustacuda::stream::Stream, inner: impl 
super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { // FIXME: forward impl crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { // FIXME: express the same with param.as_async(stream).as_mut() @@ -821,7 +947,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { let param = unsafe { param.as_ref().unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -830,7 +959,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -838,7 +970,10 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { <&'a mut DeepPerThreadBorrow as CudaKernelParameter>::async_to_ffi(param, token) } @@ -846,11 +981,16 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { emit_param_ptx_jit_marker::<_, PARAM>(param.as_ref()); // FIXME: forward impl - unsafe { crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) } + unsafe { + crate::lend::BorrowFromRust::with_borrow_from_rust_mut(param, |param| inner.with(param)) + } } } impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed @@ -908,11 +1048,14 @@ impl<'a, T: 'static> 
CudaKernelParameter for &'a mut crate::utils::shared::Threa type SyncHostType = Self; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(param) } @@ -921,7 +1064,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -929,7 +1075,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn shared_layout_for_async<'stream, 'b>( _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { Layout::new::<()>() } @@ -937,7 +1086,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn async_to_ffi<'stream, 'b, E: From>( _param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(private_shared::ThreadBlockSharedFfi { _dummy: [], _marker: PhantomData::, @@ -950,7 +1102,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( _param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { let mut param = crate::utils::shared::ThreadBlockShared::new_uninit(); inner.with(&mut param) @@ -970,11 +1125,14 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete type SyncHostType = Self; #[cfg(feature = "host")] - fn with_new_async<'stream, 'param, O, E: From>( + 
fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: &'stream rustacuda::stream::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, - ) -> Result where Self: 'param { + ) -> Result + where + Self: 'b, + { inner.with(param) } @@ -983,7 +1141,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete _param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, inner: impl for<'p> FnOnce(Option<&'p NonNull<[u8]>>) -> O, - ) -> O where Self: 'b { + ) -> O + where + Self: 'b, + { inner(None) } @@ -991,7 +1152,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn shared_layout_for_async<'stream, 'b>( param: &Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Layout where Self: 'b { + ) -> Layout + where + Self: 'b, + { param.layout() } @@ -999,7 +1163,10 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn async_to_ffi<'stream, 'b, E: From>( param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, - ) -> Result, E> where Self: 'b { + ) -> Result, E> + where + Self: 'b, + { Ok(private_shared::ThreadBlockSharedSliceFfi { len: param.len(), _marker: [], @@ -1012,9 +1179,14 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete unsafe fn with_ffi_as_device<'short, O, const PARAM: usize>( param: Self::FfiType<'static, 'short>, inner: impl super::WithFfiAsDevice, - ) -> O where Self: 'short { + ) -> O + where + Self: 'short, + { unsafe { - crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| inner.with(param)) + crate::utils::shared::ThreadBlockSharedSlice::with_uninit_for_len(param.len, |param| { + inner.with(param) + }) } } } From 61e83a65036945b70c4ebef4ac2f59994b9e5f4b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 11:36:27 +0000 Subject: [PATCH 095/120] Make CudaExchangeBuffer Sync --- src/utils/exchange/buffer/host.rs | 12 
++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index ce0cb9d41..7fc8b45bf 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,5 @@ use std::{ - cell::UnsafeCell, + cell::SyncUnsafeCell, ops::{Deref, DerefMut}, }; @@ -31,7 +31,7 @@ pub struct CudaExchangeBufferHost< host_buffer: CudaDropWrapper< LockedBuffer>>, >, - device_buffer: UnsafeCell< + device_buffer: SyncUnsafeCell< CudaDropWrapper< DeviceBuffer>>, >, @@ -55,7 +55,7 @@ impl< DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); - let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( + let device_buffer = SyncUnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -89,7 +89,7 @@ impl>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an UnsafeCell + // Safety: device_buffer is inside an SyncUnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); @@ -183,7 +183,7 @@ impl>>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an UnsafeCell + // Safety: device_buffer is inside an SyncUnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); From 8dc0c6df52348fd119230ca8f1a4edc9562a1f86 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 8 Jan 2024 11:44:58 +0000 Subject: [PATCH 096/120] Make CudaExchangeBuffer Sync v2 --- src/utils/exchange/buffer/host.rs | 12 ++++++------ src/utils/exchange/buffer/mod.rs | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 7fc8b45bf..ce0cb9d41 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -1,5 +1,5 @@ use std::{ - cell::SyncUnsafeCell, + cell::UnsafeCell, ops::{Deref, 
DerefMut}, }; @@ -31,7 +31,7 @@ pub struct CudaExchangeBufferHost< host_buffer: CudaDropWrapper< LockedBuffer>>, >, - device_buffer: SyncUnsafeCell< + device_buffer: UnsafeCell< CudaDropWrapper< DeviceBuffer>>, >, @@ -55,7 +55,7 @@ impl< DeviceCopyWithPortableBitSemantics::from_ref(elem), capacity, )?); - let device_buffer = SyncUnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( + let device_buffer = UnsafeCell::new(CudaDropWrapper::from(DeviceBuffer::from_slice( host_buffer.as_slice(), )?)); @@ -89,7 +89,7 @@ impl>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an SyncUnsafeCell + // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); @@ -183,7 +183,7 @@ impl>>, CombinedCudaAlloc, )> { - // Safety: device_buffer is inside an SyncUnsafeCell + // Safety: device_buffer is inside an UnsafeCell // borrow checks must be satisfied through LendToCuda let device_buffer = &mut *self.device_buffer.get(); diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index c48a715ac..ea5118236 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -46,6 +46,14 @@ pub struct CudaExchangeBuffer< inner: device::CudaExchangeBufferDevice, } +unsafe impl< + T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync, + const M2D: bool, + const M2H: bool, + > Sync for CudaExchangeBuffer +{ +} + #[cfg(feature = "host")] impl< T: Clone + StackOnly + PortableBitSemantics + TypeGraphLayout, From dd9507d96ed34bf03a7537d62a693266ea4a8cb5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Tue, 9 Jan 2024 03:31:42 +0000 Subject: [PATCH 097/120] Add AsyncProj proj_ref and proj_mut convenience methods --- src/kernel/param.rs | 7 +++---- src/utils/async.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 
1f149d8b4..6be634b24 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -71,8 +71,7 @@ impl< > CudaKernelParameter for PerThreadShallowCopy { #[cfg(feature = "host")] - type AsyncHostType<'stream, 'b> = - crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; + type AsyncHostType<'stream, 'b> = T where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = T where Self: 'b; type FfiType<'stream, 'b> = crate::utils::adapter::RustToCudaWithPortableBitCopySemantics where Self: 'b; @@ -88,7 +87,7 @@ impl< where Self: 'b, { - inner.with(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) + inner.with(param) } #[cfg(feature = "host")] @@ -122,7 +121,7 @@ impl< where Self: 'b, { - Ok(param) + Ok(crate::utils::adapter::RustToCudaWithPortableBitCopySemantics::from(param)) } #[cfg(feature = "device")] diff --git a/src/utils/async.rs b/src/utils/async.rs index e98758d4f..b008ac553 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -454,6 +454,33 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { } } +#[cfg(feature = "host")] +impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { + #[must_use] + pub const fn proj_ref<'b>(&'b self) -> AsyncProj<'b, 'stream, &'b T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: &self.value, + } + } + + #[must_use] + pub fn proj_mut<'b>(&'b mut self) -> AsyncProj<'b, 'stream, &'b mut T> + where + 'a: 'b, + { + AsyncProj { + _capture: PhantomData::<&'b ()>, + _stream: PhantomData::<&'stream Stream>, + value: &mut self.value, + } + } +} + #[cfg(feature = "host")] impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { #[must_use] From e2032bf4e1c34bca9dc7214e2394001ff93bdab6 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:42:52 +0000 Subject: [PATCH 098/120] Add RustToCudaWithPortableBitCloneSemantics adapter --- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 4 +- 
src/utils/adapter.rs | 172 ++++++++++++++++++ src/utils/exchange/buffer/mod.rs | 1 + 3 files changed, 175 insertions(+), 2 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index 36924aaf9..b2f624d66 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -83,7 +83,7 @@ pub fn swap_field_type_and_filter_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ #[cuda(embed = \"\")] field attribute" ); } @@ -92,7 +92,7 @@ pub fn swap_field_type_and_filter_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / #[cdua(embed)] / \ + "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ #[cuda(embed = \"\")] field attribute." ); } diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 093a02fd4..c80cab4d0 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -186,6 +186,178 @@ unsafe impl CudaAsRust } } +#[derive(Copy, Clone, Debug, TypeLayout)] +#[repr(transparent)] +pub struct RustToCudaWithPortableBitCloneSemantics< + T: Clone + PortableBitSemantics + TypeGraphLayout, +>(T); + +impl From + for RustToCudaWithPortableBitCloneSemantics +{ + fn from(value: T) -> Self { + Self(value) + } +} + +impl RustToCudaWithPortableBitCloneSemantics { + #[must_use] + pub const fn from_clone(value: &T) -> Self { + Self(value.clone()) + } + + #[must_use] + pub const fn into_inner(self) -> T { + self.0 + } + + #[must_use] + pub const fn from_ref(reference: &T) -> &Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &*core::ptr::from_ref(reference).cast() } + } + + #[must_use] + pub const fn into_ref(&self) -> &T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { 
&*core::ptr::from_ref(self).cast() } + } + + #[must_use] + pub fn from_mut(reference: &mut T) -> &mut Self { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(reference).cast() } + } + + #[must_use] + pub fn into_mut(&mut self) -> &mut T { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { &mut *core::ptr::from_mut(self).cast() } + } + + #[must_use] + pub const fn from_slice(slice: &[T]) -> &[Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub const fn into_slice(slice: &[Self]) -> &[T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn from_mut_slice(slice: &mut [T]) -> &mut [Self] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } + + #[must_use] + pub fn into_mut_slice(slice: &mut [Self]) -> &mut [T] { + // Safety: [`RustToCudaWithPortableBitCloneSemantics`] is a transparent newtype + // around `T` + unsafe { core::slice::from_raw_parts_mut(slice.as_mut_ptr().cast(), slice.len()) } + } +} + +unsafe impl RustToCuda + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocation = NoCudaAlloc; + type CudaRepresentation = Self; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok((DeviceAccessible::from(self.clone()), alloc)) + } + + #[cfg(feature = "host")] + unsafe fn restore( 
+ &mut self, + alloc: CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + Ok(alloc_tail) + } +} + +unsafe impl RustToCudaAsync + for RustToCudaWithPortableBitCloneSemantics +{ + type CudaAllocationAsync = NoCudaAlloc; + + #[cfg(feature = "host")] + #[allow(clippy::type_complexity)] + unsafe fn borrow_async<'stream, A: CudaAlloc>( + &self, + alloc: A, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, + CombinedCudaAlloc, + )> { + let alloc = CombinedCudaAlloc::new(NoCudaAlloc, alloc); + Ok(( + crate::utils::r#async::Async::ready(DeviceAccessible::from(self.clone()), stream), + alloc, + )) + } + + #[cfg(feature = "host")] + unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( + this: owning_ref::BoxRefMut<'a, O, Self>, + alloc: CombinedCudaAlloc, + stream: &'stream rustacuda::stream::Stream, + ) -> rustacuda::error::CudaResult<( + crate::utils::r#async::Async< + 'a, + 'stream, + owning_ref::BoxRefMut<'a, O, Self>, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >, + A, + )> { + let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split(); + + let r#async = crate::utils::r#async::Async::< + _, + crate::utils::r#async::CompletionFnMut<'a, Self>, + >::pending(this, stream, Box::new(|_this| Ok(())))?; + + Ok((r#async, alloc_tail)) + } +} + +unsafe impl CudaAsRust + for RustToCudaWithPortableBitCloneSemantics +{ + type RustRepresentation = Self; + + #[cfg(feature = "device")] + unsafe fn as_rust(this: &DeviceAccessible) -> Self::RustRepresentation { + let mut uninit = core::mem::MaybeUninit::uninit(); + core::ptr::copy_nonoverlapping(&**this, uninit.as_mut_ptr(), 1); + uninit.assume_init() + } +} + #[allow(clippy::module_name_repetitions)] #[derive(Copy, Clone, Debug, TypeLayout)] #[repr(transparent)] diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 
ea5118236..28ee028d1 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -46,6 +46,7 @@ pub struct CudaExchangeBuffer< inner: device::CudaExchangeBufferDevice, } +#[cfg(any(feature = "host", feature = "device"))] unsafe impl< T: StackOnly + PortableBitSemantics + TypeGraphLayout + Sync, const M2D: bool, From eb6757cda5f0b0f71e509cd3f3e56f6208d62e1d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:45:34 +0000 Subject: [PATCH 099/120] Fix invalid const fn bounds --- src/utils/adapter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index c80cab4d0..1b6b3af4c 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -202,12 +202,12 @@ impl From impl RustToCudaWithPortableBitCloneSemantics { #[must_use] - pub const fn from_clone(value: &T) -> Self { + pub fn from_clone(value: &T) -> Self { Self(value.clone()) } #[must_use] - pub const fn into_inner(self) -> T { + pub fn into_inner(self) -> T { self.0 } From 8552c2163406d0af443130f9d209b477bc34fd1b Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 04:51:23 +0000 Subject: [PATCH 100/120] Add Deref[Mut] to the adapters --- src/utils/adapter.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 1b6b3af4c..f7d041089 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -1,5 +1,7 @@ #![allow(clippy::trait_duplication_in_bounds)] +use core::ops::{Deref, DerefMut}; + use const_type_layout::{TypeGraphLayout, TypeLayout}; use crate::{ @@ -28,6 +30,24 @@ impl From } } +impl Deref + for RustToCudaWithPortableBitCopySemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCopySemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl RustToCudaWithPortableBitCopySemantics { 
#[must_use] pub const fn from_copy(value: &T) -> Self { @@ -200,6 +220,24 @@ impl From } } +impl Deref + for RustToCudaWithPortableBitCloneSemantics +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut + for RustToCudaWithPortableBitCloneSemantics +{ + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl RustToCudaWithPortableBitCloneSemantics { #[must_use] pub fn from_clone(value: &T) -> Self { @@ -374,6 +412,20 @@ impl From for DeviceCopyWithPortab } } +impl Deref for DeviceCopyWithPortableBitSemantics { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeviceCopyWithPortableBitSemantics { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl DeviceCopyWithPortableBitSemantics { #[must_use] pub fn into_inner(self) -> T { From 5e1534cf3c4bd98df88aefbfe647dcd9a519dd65 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 11 Jan 2024 05:05:06 +0000 Subject: [PATCH 101/120] Fix pointer type inference error --- src/utils/exchange/buffer/host.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index ce0cb9d41..184de1aca 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -77,9 +77,11 @@ impl>> = CudaDropWrapper::from(LockedBuffer::uninitialized(vec.len())?); + let uninit_ptr: *mut DeviceCopyWithPortableBitSemantics> = + uninit.as_mut_ptr(); + for (i, src) in vec.into_iter().enumerate() { - uninit - .as_mut_ptr() + uninit_ptr .add(i) .write(DeviceCopyWithPortableBitSemantics::from(CudaExchangeItem( src, From c74b542d35007dda960831ef1ce014c7ddb70ef8 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 12 Jan 2024 03:29:32 +0000 Subject: [PATCH 102/120] Try removing __rust_cuda_ffi_safe_assert module --- .../kernel/wrapper/generate/cuda_wrapper.rs | 26 +++++++++------- 
.../generate/host_linker_macro/get_ptx.rs | 31 ++++++++++--------- src/host/mod.rs | 2 -- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 48049c5a1..74ab20f5b 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -61,6 +61,15 @@ pub(in super::super) fn quote_cuda_wrapper( }, ); + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + quote! { #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] @@ -68,6 +77,12 @@ pub(in super::super) fn quote_cuda_wrapper( #[allow(unused_unsafe)] #(#func_attrs)* pub unsafe extern "ptx-kernel" fn #func_ident_hash(#(#ffi_inputs),*) { + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #ffi_types; + )* } + unsafe { // Initialise the dynamically-sized thread-block shared memory // and the thread-local offset pointer that points to it @@ -89,17 +104,6 @@ pub(in super::super) fn quote_cuda_wrapper( ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - #[allow(unused_imports)] - use super::*; - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #ffi_types; - )* } - } - #ffi_param_ptx_jit_wrap } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 599b68fce..303b43ff1 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -80,10 +80,24 @@ pub(super) fn quote_get_ptx( .collect::>() 
}; + let private_func_params = func_params + .iter() + .map(|param| { + let mut private = syn::Ident::clone(param); + private.set_span(proc_macro::Span::def_site().into()); + private + }) + .collect::>(); + quote! { fn get_ptx() -> &'static ::core::ffi::CStr { - #[allow(unused_imports)] - use __rust_cuda_ffi_safe_assert::#args; + #args_trait + + extern "C" { #( + #[allow(dead_code)] + #[deny(improper_ctypes)] + static #private_func_params: #cpu_func_lifetime_erased_types; + )* } #crate_path::kernel::link_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token @@ -95,19 +109,6 @@ pub(super) fn quote_get_ptx( #(#type_layout_asserts)* - #[deny(improper_ctypes)] - mod __rust_cuda_ffi_safe_assert { - #[allow(unused_imports)] - use super::*; - - #args_trait - - extern "C" { #( - #[allow(dead_code)] - static #func_params: #cpu_func_lifetime_erased_types; - )* } - } - PTX_CSTR } } diff --git a/src/host/mod.rs b/src/host/mod.rs index 2ddc768dd..ef45511e9 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -152,7 +152,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } - #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T> where @@ -164,7 +163,6 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } - #[allow(dead_code)] // FIXME #[must_use] pub(crate) fn for_host<'b: 'a>(&'b self) -> &'a T { self.host_ref From 139adce5c160f8d9d1f89b661a2ed8f623fb1212 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 06:10:07 +0000 Subject: [PATCH 103/120] Ensure async launch mutable borrow safety with barriers on use and stream move --- examples/print/src/main.rs | 16 +- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- .../wrapper/generate/cuda_generic_function.rs | 3 + src/host/mod.rs | 50 ++-- src/kernel/mod.rs | 46 +++- src/kernel/param.rs | 49 ++-- src/lend/impls/box.rs | 4 +- src/lend/impls/boxed_slice.rs | 4 
+- src/lend/impls/final.rs | 4 +- src/lend/impls/option.rs | 4 +- src/lend/impls/ref.rs | 4 +- src/lend/impls/slice_ref.rs | 4 +- src/lend/mod.rs | 16 +- src/utils/adapter.rs | 8 +- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/async.rs | 249 ++++++++++++++---- src/utils/exchange/buffer/host.rs | 4 +- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 49 ++-- 20 files changed, 358 insertions(+), 172 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7423f06ac..7cd9ab3f2 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -55,7 +55,7 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { ); // Create a new CUDA stream to submit kernels to - let stream = + let mut stream = rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new( rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING, None, @@ -70,12 +70,14 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> { }; // Launch the CUDA kernel on the stream and synchronise to its completion - println!("Launching print kernel ..."); - kernel.launch1(&stream, &config, Action::Print)?; - println!("Launching panic kernel ..."); - kernel.launch1(&stream, &config, Action::Panic)?; - println!("Launching alloc error kernel ..."); - kernel.launch1(&stream, &config, Action::AllocError)?; + rust_cuda::host::Stream::with(&mut stream, |stream| { + println!("Launching print kernel ..."); + kernel.launch1(stream, &config, Action::Print)?; + println!("Launching panic kernel ..."); + kernel.launch1(stream, &config, Action::Panic)?; + println!("Launching alloc error kernel ..."); + kernel.launch1(stream, &config, Action::AllocError) + })?; Ok(()) } diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index 40dd3487d..e45a0e283 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ 
b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -191,7 +191,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::host::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< '_, 'stream, @@ -219,7 +219,7 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &'stream #crate_path::deps::rustacuda::stream::Stream, + stream: &'stream #crate_path::host::Stream, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< 'a, 'stream, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 4084db0ed..62cb3456d 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -82,6 +82,9 @@ pub(in super::super) fn quote_cuda_generic_function( ) .collect::>(); + let generic_start_token = generic_start_token.unwrap_or_default(); + let generic_close_token = generic_close_token.unwrap_or_default(); + quote! 
{ #[cfg(target_os = "cuda")] #(#func_attrs)* diff --git a/src/host/mod.rs b/src/host/mod.rs index ef45511e9..25fd73a84 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -11,7 +11,6 @@ use rustacuda::{ event::Event, memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer}, module::Module, - stream::Stream, }; use crate::{ @@ -26,6 +25,33 @@ use crate::{ }, }; +#[repr(transparent)] +pub struct Stream { + stream: rustacuda::stream::Stream, +} + +impl Deref for Stream { + type Target = rustacuda::stream::Stream; + + fn deref(&self) -> &Self::Target { + &self.stream + } +} + +impl Stream { + pub fn with( + stream: &mut rustacuda::stream::Stream, + inner: impl for<'stream> FnOnce(&'stream Self) -> O, + ) -> O { + // Safety: + // - Stream is a newtype wrapper around rustacuda::stream::Stream + // - we forge a unique lifetime for a unique reference + let stream = unsafe { &*std::ptr::from_ref(stream).cast() }; + + inner(stream) + } +} + pub trait CudaDroppable: Sized { #[allow(clippy::missing_errors_doc)] fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>; @@ -88,7 +114,7 @@ impl CudaDroppable for LockedBuffer { } macro_rules! impl_sealed_drop_value { - ($type:ident) => { + ($type:ty) => { impl CudaDroppable for $type { fn drop(val: Self) -> Result<(), (CudaError, Self)> { Self::drop(val) @@ -98,7 +124,7 @@ macro_rules! 
impl_sealed_drop_value { } impl_sealed_drop_value!(Module); -impl_sealed_drop_value!(Stream); +impl_sealed_drop_value!(rustacuda::stream::Stream); impl_sealed_drop_value!(Context); impl_sealed_drop_value!(Event); @@ -142,7 +168,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub unsafe fn new_unchecked( + pub(crate) unsafe fn new_unchecked( device_box: &'a mut DeviceBox>, host_ref: &'a mut T, ) -> Self { @@ -180,7 +206,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> where 'a: 'b, { @@ -191,20 +217,14 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } #[must_use] - pub fn as_async<'b, 'stream>( - &'b mut self, + pub fn into_async<'b, 'stream>( + self, stream: &'stream Stream, ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, { - Async::ready( - HostAndDeviceMutRef { - device_box: self.device_box, - host_ref: self.host_ref, - }, - stream, - ) + Async::ready(self.into_mut(), stream) } } @@ -253,7 +273,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> /// # Safety /// /// `device_box` must contain EXACTLY the device copy of `host_ref` - pub const unsafe fn new_unchecked( + pub(crate) const unsafe fn new_unchecked( device_box: &'a DeviceBox>, host_ref: &'a T, ) -> Self { diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index a27ed5b71..b5fea0af8 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -11,7 +11,6 @@ use rustacuda::{ error::{CudaError, CudaResult}, function::Function, module::Module, - stream::Stream, }; #[cfg(feature = "kernel")] @@ -27,6 +26,8 @@ mod ptx_jit; #[cfg(feature = "host")] use ptx_jit::{PtxJITCompiler, PtxJITResult}; +#[cfg(feature = "host")] 
+use crate::host::Stream; use crate::safety::PortableBitSemantics; pub mod param; @@ -109,7 +110,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[allow(clippy::missing_errors_doc)] // FIXME fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -206,7 +207,9 @@ macro_rules! impl_launcher_launch { pub fn $launch_async<$($T: CudaKernelParameter),*>( &mut self, $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> + ) -> CudaResult> where Kernel: FnOnce(&mut Launcher<'stream, '_, Kernel>, $($T),*), { @@ -375,13 +378,10 @@ macro_rules! impl_typed_kernel_launch { config, $($arg,)* |kernel, stream, config, $($arg),*| { - let result = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*); + let r#async = kernel.$launch_async::<$($T),*>(stream, config, $($arg),*)?; // important: always synchronise here, this function is sync! - match (stream.synchronize(), result) { - (Ok(()), result) => result, - (Err(_), Err(err)) | (Err(err), Ok(())) => Err(err), - } + r#async.synchronize() }, ) } @@ -422,7 +422,29 @@ macro_rules! 
impl_typed_kernel_launch { stream: &'stream Stream, config: &LaunchConfig, $($arg: $T::AsyncHostType<'stream, '_>),* - ) -> CudaResult<()> + ) -> CudaResult> + // launch_async does not need to capture its parameters until kernel completion: + // - moved parameters are moved and cannot be used again, deallocation will sync + // - immutably borrowed parameters can be shared across multiple kernel launches + // - mutably borrowed parameters are more tricky: + // - Rust's borrowing rules ensure that a single mutable reference cannot be + // passed into multiple parameters of the kernel (no mutable aliasing) + // - CUDA guarantees that kernels launched on the same stream are executed + // sequentially, so even immediate resubmissions for the same mutable data + // will not have temporally overlapping mutation on the same stream + // - however, we have to guarantee that mutable data cannot be used on several + // different streams at the same time + // - Async::move_to_stream always adds a synchronisation barrier between the + // old and the new stream to ensure that all uses on the old stream happen + // strictly before all uses on the new stream + // - async launches take AsyncProj<&mut HostAndDeviceMutRef<..>>, which either + // captures an Async, which must be moved to a different stream explicitly, + // or contains data that cannot async move to a different stream without + // - any use of a mutable borrow in an async kernel launch adds a sync barrier + // on the launch stream s.t. the borrow is only complete once the kernel has + // completed where Kernel: FnOnce(&mut Launcher<'stream, 'kernel, Kernel>, $($T),*), { @@ -454,7 +476,11 @@ macro_rules! impl_typed_kernel_launch { &mut $T::async_to_ffi($arg, sealed::Token)? 
).cast::()),* ], - ) } + ) }?; + + crate::utils::r#async::Async::pending( + (), stream, crate::utils::r#async::NoCompletion, + ) } }; (impl $func:ident () + ($($other:expr),*) $inner:block) => { diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 6be634b24..a5a3cf457 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -81,7 +81,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -167,7 +167,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -251,7 +251,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -272,7 +272,7 @@ impl< where Self: 'b, { - let param = unsafe { param.unwrap_unchecked() }; + let param = unsafe { param.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -373,7 +373,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -509,7 +509,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -595,7 +595,7 @@ impl<'a, T: Sync + RustToCuda> 
CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -678,16 +678,20 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'b, { - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { - // FIXME: express the same with param.as_async(stream).as_mut() + crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| { + // FIXME: express the same with param.into_async(stream).as_mut() let _ = stream; - inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + inner.with({ + // Safety: this projection cannot be moved to a different stream + // without first exiting lend_to_cuda_mut and synchronizing + unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + }) }) } @@ -716,12 +720,13 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn async_to_ffi<'stream, 'b, E: From>( - param: Self::AsyncHostType<'stream, 'b>, + mut param: Self::AsyncHostType<'stream, 'b>, _token: sealed::Token, ) -> Result, E> where Self: 'b, { + param.record_mut_use()?; let param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -763,7 +768,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -846,7 +851,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, - stream: &'stream 
rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -867,7 +872,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where Self: 'b, { // FIXME: forward impl - crate::lend::LendToCuda::lend_to_cuda_mut(param, |mut param| { + crate::lend::LendToCuda::lend_to_cuda_mut(param, |param| { // FIXME: express the same with param.as_async(stream).as_mut() let _ = stream; - inner.with(crate::utils::r#async::AsyncProj::new(&mut param.as_mut())) + inner.with({ + // Safety: this projection cannot be moved to a different stream + // without first exiting lend_to_cuda_mut and synchronizing + unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + }) }) } @@ -1049,7 +1058,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1126,7 +1135,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream rustacuda::stream::Stream, + _stream: &'stream crate::host::Stream, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index 121fe3905..fff0bb8d8 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -90,7 +90,7 @@ unsafe impl RustToCudaAsync for Box( &self, alloc: A, - stream: 
&'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -131,7 +131,7 @@ unsafe impl RustToCudaAsync for Box( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index 09a612c98..c275a6d1c 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -96,7 +96,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -140,7 +140,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs index 6235a58fe..845424ef9 100644 --- a/src/lend/impls/final.rs +++ b/src/lend/impls/final.rs @@ -49,7 +49,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -76,7 +76,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: 
owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index b1c51b9a5..76be7e762 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -89,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -135,7 +135,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 501393f63..3ce472317 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -85,7 +85,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git 
a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 4f8a3ecd9..07271a75a 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -88,7 +88,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -132,7 +132,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 7a3934aa0..6c0467fd5 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -101,7 +101,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -324,7 +324,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( &self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -357,7 +357,7 @@ pub trait LendToCudaAsync: 
RustToCudaAsync { T: 'a, >( this: owning_ref::BoxRefMut<'a, T, Self>, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result< ( @@ -393,7 +393,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -416,7 +416,7 @@ impl LendToCudaAsync for T { ) -> Result, >( &self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where @@ -458,7 +458,7 @@ impl LendToCudaAsync for T { S: 'a, >( this: owning_ref::BoxRefMut<'a, S, Self>, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result< ( @@ -505,7 +505,7 @@ impl LendToCudaAsync for T { ) -> Result, >( self, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, inner: F, ) -> Result where diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index f7d041089..84aa28569 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -156,7 +156,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -172,7 +172,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, @@ -346,7 +346,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( 
crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -362,7 +362,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 3ca7b0597..24178131c 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -222,7 +222,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -250,7 +250,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index 2c663e9d6..c16d4bf4f 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -200,7 +200,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -230,7 +230,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: 
owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/async.rs b/src/utils/async.rs index b008ac553..7a33da8d6 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -3,12 +3,12 @@ use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::Phantom #[cfg(feature = "host")] use rustacuda::{ - error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::Stream, + error::CudaError, error::CudaResult, event::Event, event::EventFlags, stream::StreamWaitEventFlags, }; #[cfg(feature = "host")] -use crate::host::CudaDropWrapper; +use crate::host::{CudaDropWrapper, Stream}; #[cfg(feature = "host")] pub struct NoCompletion; @@ -19,6 +19,8 @@ pub type CompletionFnMut<'a, T> = Box CudaResult<()> + 'a> pub trait Completion>: sealed::Sealed { type Completed: ?Sized; + fn no_op() -> Self; + #[doc(hidden)] fn synchronize_on_drop(&self) -> bool; @@ -34,6 +36,11 @@ mod sealed { impl Completion for NoCompletion { type Completed = T; + #[inline] + fn no_op() -> Self { + Self + } + #[inline] fn synchronize_on_drop(&self) -> bool { false @@ -51,6 +58,11 @@ impl sealed::Sealed for NoCompletion {} impl<'a, T: ?Sized + BorrowMut, B: ?Sized> Completion for CompletionFnMut<'a, B> { type Completed = B; + #[inline] + fn no_op() -> Self { + Box::new(|_value| Ok(())) + } + #[inline] fn synchronize_on_drop(&self) -> bool { true @@ -68,6 +80,11 @@ impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {} impl, C: Completion> Completion for Option { type Completed = C::Completed; + #[inline] + fn no_op() -> Self { + None + } + #[inline] fn synchronize_on_drop(&self) -> bool { self.as_ref().map_or(false, Completion::synchronize_on_drop) @@ -83,7 +100,7 @@ impl sealed::Sealed for Option {} #[cfg(feature = "host")] pub struct Async<'a, 'stream, T: BorrowMut, C: 
Completion = NoCompletion> { - _stream: PhantomData<&'stream Stream>, + stream: &'stream Stream, value: T, status: AsyncStatus<'a, T, C>, _capture: PhantomData<&'a ()>, @@ -95,7 +112,7 @@ enum AsyncStatus<'a, T: BorrowMut, C: Completion> { Processing { receiver: oneshot::Receiver>, completion: C, - event: CudaDropWrapper, + event: Option>, _capture: PhantomData<&'a T>, }, Completed { @@ -108,10 +125,8 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Wraps a `value` which is ready on `stream`. #[must_use] pub const fn ready(value: T, stream: &'stream Stream) -> Self { - let _ = stream; - Self { - _stream: PhantomData::<&'stream Stream>, + stream, value, status: AsyncStatus::Completed { result: Ok(()) }, _capture: PhantomData::<&'a ()>, @@ -125,20 +140,16 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { - let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); - let (sender, receiver) = oneshot::channel(); - stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; - event.record(stream)?; Ok(Self { - _stream: PhantomData::<&'stream Stream>, + stream, value, status: AsyncStatus::Processing { receiver, completion, - event, + event: None, _capture: PhantomData::<&'a T>, }, _capture: PhantomData::<&'a ()>, @@ -157,7 +168,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. 
pub fn synchronize(self) -> CudaResult { - let (mut value, status) = self.destructure_into_parts(); + let (_stream, mut value, status) = self.destructure_into_parts(); let (receiver, completion) = match status { AsyncStatus::Completed { result } => return result.map(|()| value), @@ -182,6 +193,11 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// Moves the asynchronous data move to a different [`Stream`]. /// + /// This method always adds a synchronisation barrier between the old and + /// the new [`Stream`] to ensure that any usages of this [`Async`]'s + /// computations on the old [`Stream`] have completed before they can be + /// used on the new one. + /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. @@ -189,52 +205,45 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea self, stream: &'stream_new Stream, ) -> CudaResult> { - let (mut value, status) = self.destructure_into_parts(); - - let (receiver, completion, event) = match status { - AsyncStatus::Completed { .. 
} => { - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - value, - status, - _capture: PhantomData::<&'a ()>, - }) + let (old_stream, mut value, status) = self.destructure_into_parts(); + + let completion = match status { + AsyncStatus::Completed { result } => { + result?; + C::no_op() }, AsyncStatus::Processing { receiver, completion, - event, + event: _, _capture, - } => (receiver, completion, event), - }; - - match receiver.try_recv() { - Ok(Ok(())) => (), - Ok(Err(err)) => return Err(err), - Err(oneshot::TryRecvError::Empty) => { - stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; - - return Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, - value, - status: AsyncStatus::Processing { - receiver, - completion, - event, - _capture: PhantomData::<&'a T>, - }, - _capture: PhantomData::<&'a ()>, - }); + } => match receiver.try_recv() { + Ok(Ok(())) => { + completion.complete(value.borrow_mut())?; + C::no_op() + }, + Ok(Err(err)) => return Err(err), + Err(oneshot::TryRecvError::Empty) => completion, + Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }, - Err(oneshot::TryRecvError::Disconnected) => return Err(CudaError::AlreadyAcquired), }; - completion.complete(value.borrow_mut())?; + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + event.record(old_stream)?; + stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; + + let (sender, receiver) = oneshot::channel(); + stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; Ok(Async { - _stream: PhantomData::<&'stream_new Stream>, + stream, value, - status: AsyncStatus::Completed { result: Ok(()) }, + status: AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }, _capture: PhantomData::<&'a ()>, }) } @@ -249,7 +258,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// computation out of smaller ones that have all been 
submitted to the /// same [`Stream`]. pub unsafe fn unwrap_unchecked(self) -> CudaResult<(T, Option)> { - let (value, status) = self.destructure_into_parts(); + let (_stream, value, status) = self.destructure_into_parts(); match status { AsyncStatus::Completed { result: Ok(()) } => Ok((value, None)), @@ -264,20 +273,63 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } pub const fn as_ref(&self) -> AsyncProj<'_, 'stream, &T> { - AsyncProj::new(&self.value) + // Safety: this projection captures this async + unsafe { AsyncProj::new(&self.value, None) } } pub fn as_mut(&mut self) -> AsyncProj<'_, 'stream, &mut T> { - AsyncProj::new(&mut self.value) + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + &mut self.value, + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData::<&'a T>, + }; + + Ok(()) + })), + ) + } } #[must_use] - fn destructure_into_parts(self) -> (T, AsyncStatus<'a, T, C>) { + fn destructure_into_parts(self) -> (&'stream Stream, T, AsyncStatus<'a, T, C>) { let this = std::mem::ManuallyDrop::new(self); // Safety: we destructure self into its droppable components, // value and status, without dropping self itself - unsafe { (std::ptr::read(&this.value), (std::ptr::read(&this.status))) } + unsafe { + ( + this.stream, + std::ptr::read(&this.value), + (std::ptr::read(&this.status)), + ) + } } } @@ -360,7 +412,7 @@ impl<'a, 'stream, T: 
BorrowMut, C: Completion> IntoFuture type IntoFuture = impl Future; fn into_future(self) -> Self::IntoFuture { - let (value, status) = self.destructure_into_parts(); + let (_stream, value, status) = self.destructure_into_parts(); let (completion, status): (Option, AsyncStatus<'a, T, NoCompletion>) = match status { AsyncStatus::Completed { result } => { @@ -422,21 +474,30 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop #[cfg(feature = "host")] #[allow(clippy::module_name_repetitions)] -#[derive(Copy, Clone)] pub struct AsyncProj<'a, 'stream, T: 'a> { _capture: PhantomData<&'a ()>, _stream: PhantomData<&'stream Stream>, value: T, + use_callback: Option CudaResult<()> + 'a>>, } #[cfg(feature = "host")] impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { #[must_use] - pub(crate) const fn new(value: T) -> Self { + /// # Safety + /// + /// This projection must either capture an existing [`Async`] or come from + /// a source that ensures that the projected value can never (async) move + /// to a different [`Stream`]. + pub(crate) const unsafe fn new( + value: T, + use_callback: Option CudaResult<()> + 'a>>, + ) -> Self { Self { _capture: PhantomData::<&'a ()>, _stream: PhantomData::<&'stream Stream>, value, + use_callback, } } @@ -452,6 +513,22 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { pub(crate) unsafe fn unwrap_unchecked(self) -> T { self.value } + + #[allow(clippy::type_complexity)] + /// # Safety + /// + /// The returned reference to the inner value of type `T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_unchecked_with_use( + self, + ) -> (T, Option CudaResult<()> + 'a>>) { + (self.value, self.use_callback) + } } #[cfg(feature = "host")] @@ -465,6 +542,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: &self.value, + use_callback: None, } } @@ -477,8 +555,18 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: &mut self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), } } + + pub(crate) fn record_mut_use(&mut self) -> CudaResult<()> { + self.use_callback + .as_mut() + .map_or(Ok(()), |use_callback| use_callback()) + } } #[cfg(feature = "host")] @@ -492,8 +580,22 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: None, } } + + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) const unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } } #[cfg(feature = "host")] @@ -507,6 +609,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: None, } } @@ -519,6 +622,38 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { _capture: PhantomData::<&'b ()>, _stream: PhantomData::<&'stream Stream>, value: self.value, + use_callback: self.use_callback.as_mut().map(|use_callback| { + let use_callback: Box CudaResult<()>> = Box::new(use_callback); + use_callback + }), } } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. + pub(crate) unsafe fn unwrap_ref_unchecked(&self) -> &T { + self.value + } + + #[allow(dead_code)] // FIXME + /// # Safety + /// + /// The returned reference to the inner value of type `&T` may not yet have + /// completed its asynchronous work and may thus be in an inconsistent + /// state. + /// + /// This method must only be used to construct a larger asynchronous + /// computation out of smaller ones that have all been submitted to the + /// same [`Stream`]. 
+ pub(crate) unsafe fn unwrap_mut_unchecked(&mut self) -> &mut T { + self.value + } } diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 184de1aca..7db5ba3a2 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -180,7 +180,7 @@ impl( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>>, CombinedCudaAlloc, @@ -217,7 +217,7 @@ impl( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs index 28ee028d1..80fa09bbd 100644 --- a/src/utils/exchange/buffer/mod.rs +++ b/src/utils/exchange/buffer/mod.rs @@ -146,7 +146,7 @@ unsafe impl( &self, alloc: A, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -159,7 +159,7 @@ unsafe impl( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream rustacuda::stream::Stream, + stream: &'stream crate::host::Stream, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 1f3326c5b..faa9c5b44 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -3,12 +3,11 @@ use std::ops::{Deref, DerefMut}; use rustacuda::{ error::CudaResult, memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox}, - stream::Stream, }; use crate::{ alloc::{EmptyCudaAlloc, NoCudaAlloc}, - host::{CudaDropWrapper, 
HostAndDeviceConstRef, HostAndDeviceMutRef}, + host::{CudaDropWrapper, HostAndDeviceConstRef, HostAndDeviceMutRef, Stream}, lend::{RustToCuda, RustToCudaAsync}, safety::SafeMutableAliasing, utils::{ @@ -195,22 +194,6 @@ impl> ExchangeWrapperOnDevice { ) } } - - #[must_use] - pub fn as_mut( - &mut self, - ) -> HostAndDeviceMutRef::CudaRepresentation>> - where - T: SafeMutableAliasing, - { - // Safety: `device_box` contains exactly the device copy of `locked_cuda_repr` - unsafe { - HostAndDeviceMutRef::new_unchecked( - &mut self.device_box, - (**self.locked_cuda_repr).into_mut(), - ) - } - } } impl> @@ -339,12 +322,16 @@ impl< > { let this = unsafe { self.as_ref().unwrap_unchecked() }; - AsyncProj::new(unsafe { - HostAndDeviceConstRef::new_unchecked( - &*(this.device_box), - (**(this.locked_cuda_repr)).into_ref(), + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + HostAndDeviceConstRef::new_unchecked( + &*(this.device_box), + (**(this.locked_cuda_repr)).into_ref(), + ), + None, ) - }) + } } #[must_use] @@ -358,13 +345,17 @@ impl< where T: SafeMutableAliasing, { - let this = unsafe { self.as_mut().unwrap_unchecked() }; + let (this, use_callback) = unsafe { self.as_mut().unwrap_unchecked_with_use() }; - AsyncProj::new(unsafe { - HostAndDeviceMutRef::new_unchecked( - &mut *(this.device_box), - (**(this.locked_cuda_repr)).into_mut(), + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + HostAndDeviceMutRef::new_unchecked( + &mut *(this.device_box), + (**(this.locked_cuda_repr)).into_mut(), + ), + use_callback, ) - }) + } } } From 36aa41a374637084fe95ed95799875dc622229e4 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 07:18:28 +0000 Subject: [PATCH 104/120] Fix uniqueness guarantee for Stream using branded types --- rust-cuda-derive/src/rust_to_cuda/impl.rs | 4 +- src/host/mod.rs | 54 ++++++++++++++++------- src/kernel/mod.rs | 12 ++--- src/kernel/param.rs | 24 +++++----- src/lend/impls/box.rs | 8 
++-- src/lend/impls/boxed_slice.rs | 8 ++-- src/lend/impls/final.rs | 4 +- src/lend/impls/option.rs | 4 +- src/lend/impls/ref.rs | 6 +-- src/lend/impls/slice_ref.rs | 6 +-- src/lend/mod.rs | 16 +++---- src/utils/adapter.rs | 8 ++-- src/utils/aliasing/const.rs | 4 +- src/utils/aliasing/dynamic.rs | 4 +- src/utils/async.rs | 32 +++++++------- src/utils/exchange/buffer/host.rs | 8 ++-- src/utils/exchange/buffer/mod.rs | 4 +- src/utils/exchange/wrapper.rs | 8 ++-- 18 files changed, 119 insertions(+), 95 deletions(-) diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs index e45a0e283..e0a67b7e3 100644 --- a/rust-cuda-derive/src/rust_to_cuda/impl.rs +++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs @@ -191,7 +191,7 @@ pub fn rust_to_cuda_async_trait( unsafe fn borrow_async<'stream, CudaAllocType: #crate_path::alloc::CudaAlloc>( &self, alloc: CudaAllocType, - stream: &'stream #crate_path::host::Stream, + stream: #crate_path::host::Stream<'stream>, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< '_, 'stream, @@ -219,7 +219,7 @@ pub fn rust_to_cuda_async_trait( alloc: #crate_path::alloc::CombinedCudaAlloc< Self::CudaAllocationAsync, CudaAllocType >, - stream: &'stream #crate_path::host::Stream, + stream: #crate_path::host::Stream<'stream>, ) -> #crate_path::deps::rustacuda::error::CudaResult<( #crate_path::utils::r#async::Async< 'a, 'stream, diff --git a/src/host/mod.rs b/src/host/mod.rs index 25fd73a84..23bab2706 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -25,30 +25,54 @@ use crate::{ }, }; +type InvariantLifetime<'brand> = PhantomData &'brand ()>; + +#[derive(Copy, Clone)] #[repr(transparent)] -pub struct Stream { - stream: rustacuda::stream::Stream, +pub struct Stream<'stream> { + stream: &'stream rustacuda::stream::Stream, + _brand: InvariantLifetime<'stream>, } -impl Deref for Stream { +impl<'stream> Deref for Stream<'stream> { type Target = rustacuda::stream::Stream; fn 
deref(&self) -> &Self::Target { - &self.stream + self.stream } } -impl Stream { +impl<'stream> Stream<'stream> { + #[allow(clippy::needless_pass_by_ref_mut)] + /// Create a new uniquely branded [`Stream`], which can bind async + /// operations to the [`Stream`] that they are computed on. + /// + /// The uniqueness guarantees are provided by using branded types, + /// as inspired by the Ghost Cell paper by Yanovski, J., Dang, H.-H., + /// Jung, R., and Dreyer, D.: . + /// + /// # Examples + /// + /// The following example shows that two [`Stream`]'s with different + /// `'stream` lifetime brands cannot be used interchangeably. + /// + /// ```rust, compile_fail + /// use rust_cuda::host::Stream; + /// + /// fn check_same<'stream>(_stream_a: Stream<'stream>, _stream_b: Stream<'stream>) {} + /// + /// fn two_streams<'stream_a, 'stream_b>(stream_a: Stream<'stream_a>, stream_b: Stream<'stream_b>) { + /// check_same(stream_a, stream_b); + /// } + /// ``` pub fn with( stream: &mut rustacuda::stream::Stream, - inner: impl for<'stream> FnOnce(&'stream Self) -> O, + inner: impl for<'new_stream> FnOnce(Stream<'new_stream>) -> O, ) -> O { - // Safety: - // - Stream is a newtype wrapper around rustacuda::stream::Stream - // - we forge a unique lifetime for a unique reference - let stream = unsafe { &*std::ptr::from_ref(stream).cast() }; - - inner(stream) + inner(Stream { + stream, + _brand: InvariantLifetime::default(), + }) } } @@ -219,7 +243,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { #[must_use] pub fn into_async<'b, 'stream>( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion> where 'a: 'b, @@ -312,7 +336,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> #[must_use] pub const fn as_async<'b, 'stream>( &'b self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'b, 'stream, HostAndDeviceConstRef<'b, T>, 
NoCompletion> where 'a: 'b, @@ -370,7 +394,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> { #[must_use] pub const fn into_async<'stream>( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> Async<'a, 'stream, Self, NoCompletion> { Async::ready(self, stream) } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index b5fea0af8..42e13e0ce 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -110,7 +110,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[allow(clippy::missing_errors_doc)] // FIXME fn with_new_async<'stream, 'param, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -156,7 +156,7 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] pub struct Launcher<'stream, 'kernel, Kernel> { - pub stream: &'stream Stream, + pub stream: Stream<'stream>, pub kernel: &'kernel mut TypedPtxKernel, pub config: LaunchConfig, } @@ -366,7 +366,7 @@ macro_rules! impl_typed_kernel_launch { #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch<'kernel, 'stream, $($T: CudaKernelParameter),*>( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::SyncHostType),* ) -> CudaResult<()> @@ -396,12 +396,12 @@ macro_rules! impl_typed_kernel_launch { $($T: CudaKernelParameter),* >( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::SyncHostType,)* inner: impl FnOnce( &'kernel mut Self, - &'stream Stream, + Stream<'stream>, &LaunchConfig, $($T::AsyncHostType<'stream, '_>),* ) -> Result, @@ -419,7 +419,7 @@ macro_rules! 
impl_typed_kernel_launch { #[allow(clippy::too_many_arguments)] // func is defined for <= 12 args pub fn $launch_async<'kernel, 'stream, $($T: CudaKernelParameter),*>( &'kernel mut self, - stream: &'stream Stream, + stream: Stream<'stream>, config: &LaunchConfig, $($arg: $T::AsyncHostType<'stream, '_>),* ) -> CudaResult>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -167,7 +167,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -251,7 +251,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -373,7 +373,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -509,7 +509,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -595,7 +595,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -678,7 +678,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] 
fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -768,7 +768,7 @@ impl< #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -851,7 +851,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -932,7 +932,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1058,7 +1058,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where @@ -1135,7 +1135,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete #[cfg(feature = "host")] fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, - _stream: &'stream crate::host::Stream, + _stream: crate::host::Stream<'stream>, inner: impl super::WithNewAsync<'stream, Self, O, E>, ) -> Result where diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs index fff0bb8d8..b4cec19cd 100644 --- a/src/lend/impls/box.rs +++ b/src/lend/impls/box.rs @@ -90,7 +90,7 @@ unsafe impl 
RustToCudaAsync for Box( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -113,7 +113,7 @@ unsafe impl RustToCudaAsync for Box>, >::uninitialized()?); - device_box.async_copy_from(&*locked_box, stream)?; + device_box.async_copy_from(&*locked_box, &stream)?; Ok(( Async::pending( @@ -131,7 +131,7 @@ unsafe impl RustToCudaAsync for Box( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -141,7 +141,7 @@ unsafe impl RustToCudaAsync for Box>::pending( this, diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs index c275a6d1c..5215d2acf 100644 --- a/src/lend/impls/boxed_slice.rs +++ b/src/lend/impls/boxed_slice.rs @@ -96,7 +96,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -120,7 +120,7 @@ unsafe impl RustToCudaAsync for Box<[ let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< DeviceCopyWithPortableBitSemantics>, >::uninitialized(self.len())?); - device_buffer.async_copy_from(&*locked_buffer, stream)?; + device_buffer.async_copy_from(&*locked_buffer, &stream)?; Ok(( Async::pending( @@ -140,7 +140,7 @@ unsafe impl RustToCudaAsync for Box<[ unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -150,7 +150,7 @@ unsafe 
impl RustToCudaAsync for Box<[ let (alloc_front, alloc_tail) = alloc.split(); let (mut locked_buffer, device_buffer) = alloc_front.split(); - device_buffer.async_copy_to(&mut *locked_buffer, stream)?; + device_buffer.async_copy_to(&mut *locked_buffer, &stream)?; let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending( this, diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs index 845424ef9..5799a77eb 100644 --- a/src/lend/impls/final.rs +++ b/src/lend/impls/final.rs @@ -49,7 +49,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -76,7 +76,7 @@ unsafe impl RustToCudaAsync for Final { unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 76be7e762..0e9c3c34d 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -89,7 +89,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -135,7 +135,7 @@ unsafe impl RustToCudaAsync for Option { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, 
CompletionFnMut<'a, Self>>, A, diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs index 3ce472317..4233d1423 100644 --- a/src/lend/impls/ref.rs +++ b/src/lend/impls/ref.rs @@ -85,7 +85,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -108,7 +108,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & let mut device_box = CudaDropWrapper::from(DeviceBox::< DeviceCopyWithPortableBitSemantics>, >::uninitialized()?); - device_box.async_copy_from(&*locked_box, stream)?; + device_box.async_copy_from(&*locked_box, &stream)?; Ok(( Async::pending( @@ -127,7 +127,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs index 07271a75a..bd74dea64 100644 --- a/src/lend/impls/slice_ref.rs +++ b/src/lend/impls/slice_ref.rs @@ -88,7 +88,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -112,7 +112,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::< DeviceCopyWithPortableBitSemantics>, 
>::uninitialized(self.len())?); - device_buffer.async_copy_from(&*locked_buffer, stream)?; + device_buffer.async_copy_from(&*locked_buffer, &stream)?; Ok(( Async::pending( @@ -132,7 +132,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for & unsafe fn restore_async<'b, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'b, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> CudaResult<( Async<'b, 'stream, owning_ref::BoxRefMut<'b, O, Self>, CompletionFnMut<'b, Self>>, A, diff --git a/src/lend/mod.rs b/src/lend/mod.rs index 6c0467fd5..e05237768 100644 --- a/src/lend/mod.rs +++ b/src/lend/mod.rs @@ -101,7 +101,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -127,7 +127,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda { unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -324,7 +324,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( &self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where @@ -357,7 +357,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { T: 'a, >( this: owning_ref::BoxRefMut<'a, T, Self>, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result< ( @@ -393,7 +393,7 @@ pub trait LendToCudaAsync: RustToCudaAsync { ) -> Result, >( self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) 
-> Result where @@ -416,7 +416,7 @@ impl LendToCudaAsync for T { ) -> Result, >( &self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where @@ -458,7 +458,7 @@ impl LendToCudaAsync for T { S: 'a, >( this: owning_ref::BoxRefMut<'a, S, Self>, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result< ( @@ -505,7 +505,7 @@ impl LendToCudaAsync for T { ) -> Result, >( self, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, inner: F, ) -> Result where diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs index 84aa28569..fa023cc66 100644 --- a/src/utils/adapter.rs +++ b/src/utils/adapter.rs @@ -156,7 +156,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -172,7 +172,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, @@ -346,7 +346,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -362,7 +362,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git 
a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs index 24178131c..097b4c0f4 100644 --- a/src/utils/aliasing/const.rs +++ b/src/utils/aliasing/const.rs @@ -222,7 +222,7 @@ unsafe impl RustToCudaAsync unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -250,7 +250,7 @@ unsafe impl RustToCudaAsync unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs index c16d4bf4f..3928c87d1 100644 --- a/src/utils/aliasing/dynamic.rs +++ b/src/utils/aliasing/dynamic.rs @@ -200,7 +200,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn borrow_async<'stream, A: crate::alloc::CudaAlloc>( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async<'_, 'stream, DeviceAccessible>, crate::alloc::CombinedCudaAlloc, @@ -230,7 +230,7 @@ unsafe impl RustToCudaAsync for SplitSliceOverCudaThreadsDyn unsafe fn restore_async<'a, 'stream, A: crate::alloc::CudaAlloc, O>( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: crate::alloc::CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( crate::utils::r#async::Async< 'a, diff --git a/src/utils/async.rs b/src/utils/async.rs index 7a33da8d6..24ef8bfc2 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -100,7 +100,7 @@ impl sealed::Sealed for Option {} #[cfg(feature = "host")] pub 
struct Async<'a, 'stream, T: BorrowMut, C: Completion = NoCompletion> { - stream: &'stream Stream, + stream: Stream<'stream>, value: T, status: AsyncStatus<'a, T, C>, _capture: PhantomData<&'a ()>, @@ -124,7 +124,7 @@ enum AsyncStatus<'a, T: BorrowMut, C: Completion> { impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'stream, T, C> { /// Wraps a `value` which is ready on `stream`. #[must_use] - pub const fn ready(value: T, stream: &'stream Stream) -> Self { + pub const fn ready(value: T, stream: Stream<'stream>) -> Self { Self { stream, value, @@ -139,7 +139,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside /// CUDA. - pub fn pending(value: T, stream: &'stream Stream, completion: C) -> CudaResult { + pub fn pending(value: T, stream: Stream<'stream>, completion: C) -> CudaResult { let (sender, receiver) = oneshot::channel(); stream.add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; @@ -203,7 +203,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea /// CUDA. 
pub fn move_to_stream<'stream_new>( self, - stream: &'stream_new Stream, + stream: Stream<'stream_new>, ) -> CudaResult> { let (old_stream, mut value, status) = self.destructure_into_parts(); @@ -229,7 +229,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea }; let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); - event.record(old_stream)?; + event.record(&old_stream)?; stream.wait_event(&event, StreamWaitEventFlags::DEFAULT)?; let (sender, receiver) = oneshot::channel(); @@ -302,7 +302,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea self.stream .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; - event.record(self.stream)?; + event.record(&self.stream)?; self.status = AsyncStatus::Processing { receiver, @@ -318,7 +318,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } #[must_use] - fn destructure_into_parts(self) -> (&'stream Stream, T, AsyncStatus<'a, T, C>) { + fn destructure_into_parts(self) -> (Stream<'stream>, T, AsyncStatus<'a, T, C>) { let this = std::mem::ManuallyDrop::new(self); // Safety: we destructure self into its droppable components, @@ -354,7 +354,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<' #[cfg(feature = "host")] struct AsyncFuture<'a, 'stream, T: BorrowMut, C: Completion> { - _stream: PhantomData<&'stream Stream>, + _stream: PhantomData>, value: Option, completion: Option, status: AsyncStatus<'a, T, NoCompletion>, @@ -435,7 +435,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> IntoFuture }; AsyncFuture { - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: Some(value), completion, status, @@ -476,7 +476,7 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Drop #[allow(clippy::module_name_repetitions)] pub struct AsyncProj<'a, 'stream, T: 'a> { _capture: PhantomData<&'a ()>, - _stream: PhantomData<&'stream Stream>, + _stream: PhantomData>, value: T, use_callback: Option CudaResult<()> + 'a>>, } 
@@ -495,7 +495,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { ) -> Self { Self { _capture: PhantomData::<&'a ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value, use_callback, } @@ -540,7 +540,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: &self.value, use_callback: None, } @@ -553,7 +553,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: &mut self.value, use_callback: self.use_callback.as_mut().map(|use_callback| { let use_callback: Box CudaResult<()>> = Box::new(use_callback); @@ -578,7 +578,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: None, } @@ -607,7 +607,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: None, } @@ -620,7 +620,7 @@ impl<'a, 'stream, T: 'a> AsyncProj<'a, 'stream, &'a mut T> { { AsyncProj { _capture: PhantomData::<&'b ()>, - _stream: PhantomData::<&'stream Stream>, + _stream: PhantomData::>, value: self.value, use_callback: self.use_callback.as_mut().map(|use_callback| { let use_callback: Box CudaResult<()>> = Box::new(use_callback); diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs index 7db5ba3a2..e252d0ce7 100644 --- a/src/utils/exchange/buffer/host.rs +++ b/src/utils/exchange/buffer/host.rs @@ -180,7 +180,7 @@ impl( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, 
DeviceAccessible>>, CombinedCudaAlloc, @@ -195,7 +195,7 @@ impl( mut this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, @@ -232,7 +232,7 @@ impl( &self, alloc: A, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'_, 'stream, DeviceAccessible>, CombinedCudaAlloc, @@ -159,7 +159,7 @@ unsafe impl( this: owning_ref::BoxRefMut<'a, O, Self>, alloc: CombinedCudaAlloc, - stream: &'stream crate::host::Stream, + stream: crate::host::Stream<'stream>, ) -> rustacuda::error::CudaResult<( Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>, A, diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index faa9c5b44..9f1196bb0 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -119,7 +119,7 @@ impl( mut self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult, NoCompletion>> { let (cuda_repr, _null_alloc) = unsafe { self.value.borrow_async(NoCudaAlloc, stream) }?; let (cuda_repr, _completion): (_, Option) = @@ -132,7 +132,7 @@ impl( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult< Async< 'static, @@ -265,7 +265,7 @@ impl< /// CUDA pub fn move_to_host_async( self, - stream: &'stream Stream, + stream: Stream<'stream>, ) -> CudaResult< Async< 'static, From 0b355cc4c1bd417e8d3eefd1652a329b16c37f01 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 08:56:07 +0000 Subject: [PATCH 105/120] Try without ref proj --- src/kernel/param.rs | 40 +++++++++++++++++++++-------------- src/utils/exchange/wrapper.rs | 2 +- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/kernel/param.rs b/src/kernel/param.rs index 8cafb8c8d..ff53f6dd4 100644 --- 
a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -156,7 +156,7 @@ impl< type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T>, + crate::host::HostAndDeviceConstRef<'b, T>, > where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T where Self: 'b; @@ -173,8 +173,9 @@ impl< where Self: 'b, { + let _ = stream; crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner.with(const_ref.as_async(stream).as_ref()) + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) }) }) } @@ -257,9 +258,10 @@ impl< where Self: 'b, { + let _ = stream; // FIXME: forward impl crate::host::HostAndDeviceConstRef::with_new(param, |const_ref| { - inner.with(const_ref.as_async(stream).as_ref()) + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(const_ref, None) }) }) } @@ -272,7 +274,8 @@ impl< where Self: 'b, { - let param = unsafe { param.unwrap_ref_unchecked() }; + let param_ref = param.proj_ref(); + let param = unsafe { param_ref.unwrap_ref_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } @@ -360,7 +363,7 @@ impl< type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef<'b, T> + crate::host::HostAndDeviceConstRef<'b, T> > where Self: 'b; #[cfg(any(feature = "device", doc))] type DeviceType<'b> = &'b T where Self: 'b; @@ -379,8 +382,9 @@ impl< where Self: 'b, { - crate::host::HostAndDeviceMutRef::with_new(param, |const_ref| { - inner.with(const_ref.as_ref().as_async(stream).as_ref()) + let _ = stream; + crate::host::HostAndDeviceMutRef::with_new(param, |mut_ref| { + inner.with(unsafe { crate::utils::r#async::AsyncProj::new(mut_ref.as_ref(), None) }) }) } @@ -580,7 +584,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b crate::host::HostAndDeviceConstRef< + 
crate::host::HostAndDeviceConstRef< 'b, DeviceAccessible<::CudaRepresentation>, >, @@ -601,8 +605,9 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow CudaKernelParameter type AsyncHostType<'stream, 'b> = crate::utils::r#async::AsyncProj< 'b, 'stream, - &'b mut crate::host::HostAndDeviceMutRef< + crate::host::HostAndDeviceMutRef< 'b, DeviceAccessible<::CudaRepresentation>, >, @@ -690,7 +695,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter inner.with({ // Safety: this projection cannot be moved to a different stream // without first exiting lend_to_cuda_mut and synchronizing - unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) } }) }) } @@ -727,7 +732,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter Self: 'b, { param.record_mut_use()?; - let param = unsafe { param.unwrap_unchecked() }; + let mut param = unsafe { param.unwrap_unchecked() }; Ok(param.for_device()) } @@ -858,8 +863,9 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit CudaKernelParameter for &'a PtxJit CudaKernelParameter inner.with({ // Safety: this projection cannot be moved to a different stream // without first exiting lend_to_cuda_mut and synchronizing - unsafe { crate::utils::r#async::AsyncProj::new(&mut param.into_mut(), None) } + unsafe { crate::utils::r#async::AsyncProj::new(param.into_mut(), None) } }) }) } @@ -959,7 +966,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter where Self: 'b, { - let param = unsafe { param.as_ref().unwrap_unchecked() }; + let param_ref = param.proj_ref(); + let param = unsafe { param_ref.unwrap_unchecked() }; inner(Some(¶m_as_raw_bytes(param.for_host()))) } diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index 9f1196bb0..f84bfa04d 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs 
@@ -340,7 +340,7 @@ impl< ) -> AsyncProj< '_, 'stream, - HostAndDeviceMutRef::CudaRepresentation>>, + HostAndDeviceMutRef<'_, DeviceAccessible<::CudaRepresentation>>, > where T: SafeMutableAliasing, From e6f20dc77ecb07de54d5e16addbbf39376542e8e Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 09:10:07 +0000 Subject: [PATCH 106/120] Try add extract ref --- src/host/mod.rs | 11 +++++++ src/utils/async.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/src/host/mod.rs b/src/host/mod.rs index 23bab2706..589556560 100644 --- a/src/host/mod.rs +++ b/src/host/mod.rs @@ -229,6 +229,17 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> { } } + #[must_use] + pub(crate) unsafe fn as_mut<'b>(&'b mut self) -> HostAndDeviceMutRef<'b, T> + where + 'a: 'b, + { + HostAndDeviceMutRef { + device_box: self.device_box, + host_ref: self.host_ref, + } + } + #[must_use] pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T> where diff --git a/src/utils/async.rs b/src/utils/async.rs index 24ef8bfc2..be4e2458c 100644 --- a/src/utils/async.rs +++ b/src/utils/async.rs @@ -333,6 +333,82 @@ impl<'a, 'stream, T: BorrowMut, C: Completion> Async<'a, 'strea } } +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceConstRef<'a, T>, C> +where + crate::host::HostAndDeviceConstRef<'a, T>: BorrowMut, +{ + pub const fn extract_ref( + &self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } +} + +#[cfg(feature = "host")] +impl< + 'a, + 'stream, + T: crate::safety::PortableBitSemantics + const_type_layout::TypeGraphLayout, + C: Completion>, + > Async<'a, 'stream, crate::host::HostAndDeviceMutRef<'a, T>, C> +where + 
crate::host::HostAndDeviceMutRef<'a, T>: BorrowMut, +{ + pub fn extract_ref(&self) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceConstRef<'_, T>> { + // Safety: this projection captures this async + unsafe { AsyncProj::new(self.value.as_ref(), None) } + } + + pub fn extract_mut( + &mut self, + ) -> AsyncProj<'_, 'stream, crate::host::HostAndDeviceMutRef<'_, T>> { + // Safety: this projection captures this async + unsafe { + AsyncProj::new( + self.value.as_mut(), + Some(Box::new(|| { + let completion = match &mut self.status { + AsyncStatus::Completed { result } => { + (*result)?; + C::no_op() + }, + AsyncStatus::Processing { + receiver: _, + completion, + event: _, + _capture, + } => std::mem::replace(completion, C::no_op()), + }; + + let event = CudaDropWrapper::from(Event::new(EventFlags::DISABLE_TIMING)?); + + let (sender, receiver) = oneshot::channel(); + + self.stream + .add_callback(Box::new(|result| std::mem::drop(sender.send(result))))?; + event.record(&self.stream)?; + + self.status = AsyncStatus::Processing { + receiver, + completion, + event: Some(event), + _capture: PhantomData, + }; + + Ok(()) + })), + ) + } + } +} + #[cfg(feature = "host")] impl<'a, 'stream, T: BorrowMut, C: Completion> Drop for Async<'a, 'stream, T, C> { fn drop(&mut self) { From 4148959b21ba72881434e6d1f94fd4bd35f27e2f Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 13 Jan 2024 09:22:18 +0000 Subject: [PATCH 107/120] Fix doc link --- src/utils/exchange/wrapper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs index f84bfa04d..bb137a4af 100644 --- a/src/utils/exchange/wrapper.rs +++ b/src/utils/exchange/wrapper.rs @@ -87,7 +87,7 @@ impl> ExchangeWrapperOnHost { /// Moves the data synchronously to the CUDA device, where it can then be /// lent out immutably via [`ExchangeWrapperOnDevice::as_ref`], or mutably - /// via [`ExchangeWrapperOnDevice::as_mut`]. 
+ /// via [`ExchangeWrapperOnDevice::as_mut_async`](Async::as_mut_async). /// /// # Errors /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside From d1f141e9044ffa24bd286c3b8dd1213ca74436cf Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 14 Jan 2024 03:22:39 +0000 Subject: [PATCH 108/120] clean up kernel signature check --- .../src/rust_to_cuda/field_copy.rs | 2 +- rust-cuda-kernel/src/kernel/link/config.rs | 2 +- rust-cuda-kernel/src/kernel/link/mod.rs | 30 +++++++--- rust-cuda-kernel/src/kernel/mod.rs | 2 + .../wrapper/generate/cuda_generic_function.rs | 2 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 40 +++++-------- .../kernel/wrapper/generate/host_kernel_ty.rs | 2 +- .../generate/host_linker_macro/args_trait.rs | 2 +- .../generate/host_linker_macro/get_ptx.rs | 56 +++++++++---------- .../wrapper/generate/host_linker_macro/mod.rs | 2 +- rust-cuda-kernel/src/kernel/wrapper/mod.rs | 6 +- src/safety/mod.rs | 4 +- ...kernel_signature.rs => ptx_entry_point.rs} | 18 ++++-- src/safety/ptx_kernel_signature.rs | 51 +++++++++++++++++ src/safety/type_layout.rs | 33 ----------- 15 files changed, 136 insertions(+), 116 deletions(-) rename src/safety/{kernel_signature.rs => ptx_entry_point.rs} (62%) create mode 100644 src/safety/ptx_kernel_signature.rs delete mode 100644 src/safety/type_layout.rs diff --git a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs index 05d133156..18fd867c1 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_copy.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_copy.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use quote::{format_ident, quote, ToTokens}; -use super::field_ty::CudaReprFieldTy; +use crate::rust_to_cuda::field_ty::CudaReprFieldTy; #[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn impl_field_copy_init_and_expand_alloc_type( diff --git a/rust-cuda-kernel/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs index 
d7a4d0458..469318f02 100644 --- a/rust-cuda-kernel/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,6 +1,6 @@ use std::{collections::HashMap, path::PathBuf}; -use super::super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 8424e7056..b64776707 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -13,15 +13,17 @@ use std::{ use colored::Colorize; use proc_macro::TokenStream; +use proc_macro2::Span; use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; -use super::{ +use crate::kernel::{ lints::{LintLevel, PtxLint}, utils::skip_kernel_compilation, - KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, + PTX_CSTR_IDENT, }; mod config; @@ -33,7 +35,9 @@ use error::emit_ptx_build_error; use ptx_compiler_sys::NvptxError; pub fn check_kernel(tokens: TokenStream) -> TokenStream { - proc_macro_error::set_dummy(quote! {::core::result::Result::Err(())}); + proc_macro_error::set_dummy( + quote! 
{::core::compile_error!("rust-cuda PTX kernel check failed");}, + ); let CheckKernelConfig { kernel, @@ -54,7 +58,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { - return quote!(::core::result::Result::Err(())).into(); + return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into(); }; check_kernel_ptx_and_report( @@ -64,13 +68,18 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { &HashMap::new(), ); - quote!(::core::result::Result::Ok(())).into() + quote!().into() } #[allow(clippy::module_name_repetitions)] pub fn link_kernel(tokens: TokenStream) -> TokenStream { + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site()); + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site()); + proc_macro_error::set_dummy(quote! { - const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); let LinkKernelConfig { @@ -94,7 +103,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { if skip_kernel_compilation() { return quote! { - const PTX_CSTR: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"CLIPPY skips specialised PTX compilation"; } .into(); } @@ -106,7 +115,9 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Specialisation::Link(&specialisation), ) else { return (quote! 
{ - const PTX_CSTR: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); }; @@ -137,7 +148,8 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let kernel_ptx = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; - (quote! { const PTX_CSTR: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }).into() + (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }) + .into() } fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec { diff --git a/rust-cuda-kernel/src/kernel/mod.rs b/rust-cuda-kernel/src/kernel/mod.rs index 9e3a80789..86ffbd8fd 100644 --- a/rust-cuda-kernel/src/kernel/mod.rs +++ b/rust-cuda-kernel/src/kernel/mod.rs @@ -7,3 +7,5 @@ mod utils; const KERNEL_TYPE_USE_START_CANARY: &str = "// //"; const KERNEL_TYPE_USE_END_CANARY: &str = "// //"; +const KERNEL_TYPE_LAYOUT_IDENT: &str = "KERNEL_SIGNATURE_LAYOUT"; +const PTX_CSTR_IDENT: &str = "PTX_CSTR"; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 62cb3456d..ccf21c96b 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,7 +1,7 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::{DeclGenerics, FuncIdent}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; pub(in super::super) fn quote_cuda_generic_function( crate_path: &syn::Path, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 74ab20f5b..938074e56 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,9 +1,9 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use super::super::{ - super::{KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY}, - FuncIdent, FunctionInputs, ImplGenerics, +use crate::kernel::{ + wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, KERNEL_TYPE_USE_END_CANARY, KERNEL_TYPE_USE_START_CANARY, }; #[allow(clippy::too_many_lines)] @@ -25,16 +25,6 @@ pub(in super::super) fn quote_cuda_wrapper( let (ffi_inputs, ffi_types) = specialise_ffi_input_types(crate_path, inputs, func, impl_generics); - let func_layout_params = func_params - .iter() - .map(|ident| { - syn::Ident::new( - &format!("__{func_ident_hash}_{ident}_layout").to_uppercase(), - ident.span(), - ) - }) - .collect::>(); - let ffi_param_ptx_jit_wrap = func_inputs.iter().enumerate().rev().fold( quote! { #func_ident(#(#func_params),*) @@ -70,6 +60,9 @@ pub(in super::super) fn quote_cuda_wrapper( }) .collect::>(); + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#ffi_types),*) }; + quote! 
{ #[cfg(target_os = "cuda")] #[#crate_path::device::specialise_kernel_function(#func_ident)] @@ -89,20 +82,13 @@ pub(in super::super) fn quote_cuda_wrapper( #crate_path::utils::shared::init(); } - unsafe { - ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); - } - #( - #[no_mangle] - static #func_layout_params: [ - u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_types>() - ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_types>(); - - unsafe { ::core::ptr::read_volatile(&#func_layout_params[0]) }; - )* - unsafe { - ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); - } + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_START_CANARY); } + #[no_mangle] + static #ffi_signature_ident: [ + u8; #crate_path::deps::const_type_layout::serialised_type_graph_len::<#ffi_signature_ty>() + ] = #crate_path::deps::const_type_layout::serialise_type_graph::<#ffi_signature_ty>(); + unsafe { ::core::ptr::read_volatile(&#ffi_signature_ident) }; + unsafe { ::core::arch::asm!(#KERNEL_TYPE_USE_END_CANARY); } #ffi_param_ptx_jit_wrap } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs index 84ece28b5..78e972d69 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_host_kernel_ty( crate_path: &syn::Path, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs index 25cc27955..26653e435 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs +++ 
b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::super::{FunctionInputs, ImplGenerics}; +use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; pub(in super::super) fn quote_args_trait( args: &syn::Ident, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs index 303b43ff1..955093e5c 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs @@ -1,9 +1,11 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; -use crate::kernel::utils::skip_kernel_compilation; - -use super::super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; +use crate::kernel::{ + utils::skip_kernel_compilation, + wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}, + KERNEL_TYPE_LAYOUT_IDENT, PTX_CSTR_IDENT, +}; #[allow(clippy::too_many_arguments)] pub(super) fn quote_get_ptx( @@ -38,15 +40,17 @@ pub(super) fn quote_get_ptx( let cpu_func_lifetime_erased_types = generate_lifetime_erased_types(crate_path, &args, generics, inputs, macro_type_ids); + let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, func_ident.span()); + let matching_kernel_assert = if skip_kernel_compilation() { quote!() } else { quote::quote_spanned! 
{ func_ident.span()=> - const _: #crate_path::safety::kernel_signature::Assert<{ - #crate_path::safety::kernel_signature::CpuAndGpuKernelSignatures::Match - }> = #crate_path::safety::kernel_signature::Assert::<{ - #crate_path::safety::kernel_signature::check( - PTX_CSTR.to_bytes(), + const _: #crate_path::safety::ptx_entry_point::Assert<{ + #crate_path::safety::ptx_entry_point::HostAndDeviceKernelEntryPoint::Match + }> = #crate_path::safety::ptx_entry_point::Assert::<{ + #crate_path::safety::ptx_entry_point::check( + #ptx_cstr_ident.to_bytes(), #crate_path::kernel::specialise_kernel_entry_point!( #func_ident_hash #generic_start_token #($#macro_type_ids),* @@ -57,27 +61,19 @@ pub(super) fn quote_get_ptx( } }; - let type_layout_asserts = if skip_kernel_compilation() { - Vec::new() + let signature_layout_assert = if skip_kernel_compilation() { + quote!() } else { - cpu_func_lifetime_erased_types - .iter() - .zip(func_params.iter()) - .map(|(ty, param)| { - let layout_param = syn::Ident::new( - &format!("__{func_ident_hash}_{param}_layout").to_uppercase(), - param.span(), - ); - - quote::quote_spanned! { ty.span()=> - const _: #crate_path::safety::type_layout::Assert<{ - #crate_path::safety::type_layout::CpuAndGpuTypeLayouts::Match - }> = #crate_path::safety::type_layout::Assert::<{ - #crate_path::safety::type_layout::check::<#ty>(#layout_param) - }>; - } - }) - .collect::>() + let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); + let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; + + quote::quote_spanned! 
{ func_ident.span()=> + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) + }>; + } }; let private_func_params = func_params @@ -107,9 +103,9 @@ pub(super) fn quote_get_ptx( #matching_kernel_assert - #(#type_layout_asserts)* + #signature_layout_assert - PTX_CSTR + #ptx_cstr_ident } } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs index cfc0af751..36479b62a 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs @@ -1,6 +1,6 @@ use proc_macro2::TokenStream; -use super::super::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; +use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; mod args_trait; mod get_ptx; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index 7793c2dc0..f400e3147 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -9,7 +9,7 @@ mod config; mod generate; mod parse; -use super::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; +use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ @@ -346,8 +346,8 @@ fn quote_generic_check( quote::quote_spanned! { func_ident_hash.span()=> #[cfg(not(target_os = "cuda"))] - const _: ::core::result::Result<(), ()> = #crate_path::kernel::check_kernel!( + #crate_path::kernel::check_kernel! 
{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir - ); + } } } diff --git a/src/safety/mod.rs b/src/safety/mod.rs index c26ef3389..7e078e34e 100644 --- a/src/safety/mod.rs +++ b/src/safety/mod.rs @@ -4,9 +4,9 @@ mod portable; mod stack_only; #[doc(hidden)] -pub mod kernel_signature; +pub mod ptx_entry_point; #[doc(hidden)] -pub mod type_layout; +pub mod ptx_kernel_signature; pub use aliasing::SafeMutableAliasing; pub use portable::PortableBitSemantics; diff --git a/src/safety/kernel_signature.rs b/src/safety/ptx_entry_point.rs similarity index 62% rename from src/safety/kernel_signature.rs rename to src/safety/ptx_entry_point.rs index 96bdd3f32..b1d62cf4e 100644 --- a/src/safety/kernel_signature.rs +++ b/src/safety/ptx_entry_point.rs @@ -1,30 +1,36 @@ #[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuKernelSignatures { +pub enum HostAndDeviceKernelEntryPoint { Match, Mismatch, } -pub struct Assert; +pub struct Assert; #[must_use] -pub const fn check(ptx: &[u8], entry_point: &[u8]) -> CpuAndGpuKernelSignatures { +pub const fn check(ptx: &[u8], entry_point: &[u8]) -> HostAndDeviceKernelEntryPoint { + const PTX_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; const KERNEL_TYPE: &[u8] = b".visible .entry "; + // Short-circuit to avoid extra errors when PTX compilation fails + if ptx.len() == PTX_ERROR_MESSAGE.len() && starts_with(ptx, PTX_ERROR_MESSAGE, 0) { + return HostAndDeviceKernelEntryPoint::Match; + } + let mut j = 0; while j < ptx.len() { let Some(j2) = find(ptx, KERNEL_TYPE, j) else { - return CpuAndGpuKernelSignatures::Mismatch; + return HostAndDeviceKernelEntryPoint::Mismatch; }; if starts_with(ptx, entry_point, j2) { - return CpuAndGpuKernelSignatures::Match; + return HostAndDeviceKernelEntryPoint::Match; } j += 1; } - CpuAndGpuKernelSignatures::Mismatch + HostAndDeviceKernelEntryPoint::Mismatch } const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { diff --git a/src/safety/ptx_kernel_signature.rs 
b/src/safety/ptx_kernel_signature.rs new file mode 100644 index 000000000..eb4a63820 --- /dev/null +++ b/src/safety/ptx_kernel_signature.rs @@ -0,0 +1,51 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, +} + +pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if host.len() != device.len() { + return false; + } + + let mut i = 0; + + while i < host.len() { + if host[i] != device[i] { + return false; + } + + i += 1; + } + + true +} diff --git a/src/safety/type_layout.rs b/src/safety/type_layout.rs deleted file mode 100644 index f225f0055..000000000 --- a/src/safety/type_layout.rs +++ /dev/null @@ -1,33 +0,0 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; - -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum CpuAndGpuTypeLayouts { - Match, - Mismatch, -} - -pub struct Assert; - -#[must_use] -pub const fn check(device: &'static [u8]) -> CpuAndGpuTypeLayouts -where - [u8; serialised_type_graph_len::()]:, -{ - let host = serialise_type_graph::(); - - if host.len() != device.len() { - return CpuAndGpuTypeLayouts::Mismatch; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != 
device[i] { - return CpuAndGpuTypeLayouts::Mismatch; - } - - i += 1; - } - - CpuAndGpuTypeLayouts::Match -} From 19120623df08f95f2b0786bdcdb3cdecfdc28842 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 18 Jan 2024 13:53:51 +0000 Subject: [PATCH 109/120] Some cleanup before merging --- examples/print/src/main.rs | 10 ++++- rust-cuda-kernel/src/kernel/link/mod.rs | 23 +++++----- .../src/kernel/specialise/entry_point.rs | 23 ++++------ rust-cuda-kernel/src/lib.rs | 1 + src/device/utils.rs | 43 ++++++++++++++----- src/kernel/mod.rs | 14 +++--- src/kernel/param.rs | 36 ++++++++++------ src/lend/impls/option.rs | 6 +-- 8 files changed, 94 insertions(+), 62 deletions(-) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 7cd9ab3f2..93a50ba55 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -93,12 +93,18 @@ mod cuda_prelude { fn panic(info: &::core::panic::PanicInfo) -> ! { // pretty format and print the panic message // but don't allow dynamic formatting or panic payload downcasting - rust_cuda::device::utils::pretty_panic_handler(info, false, false) + rust_cuda::device::utils::pretty_print_panic_info(info, false, false); + + // Safety: no mutable data is shared with the kernel + unsafe { rust_cuda::device::utils::exit() } } #[alloc_error_handler] #[track_caller] fn alloc_error_handler(layout: ::core::alloc::Layout) -> ! 
{ - rust_cuda::device::utils::pretty_alloc_error_handler(layout) + rust_cuda::device::utils::pretty_print_alloc_error(layout); + + // Safety: no mutable data is shared with the kernel + unsafe { rust_cuda::device::utils::exit() } } } diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index b64776707..b83eab9d2 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, env, - ffi::{CStr, CString}, + ffi::CString, fmt::Write as FmtWrite, fs, io::{Read, Write}, @@ -132,21 +132,18 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { &ptx_lint_levels, ); - let mut kernel_ptx = kernel_ptx.into_bytes(); - kernel_ptx.push(b'\0'); - - if let Err(err) = CStr::from_bytes_with_nul(&kernel_ptx) { - abort_call_site!( + let kernel_ptx = match CString::new(kernel_ptx) { + Ok(kernel_ptx) => kernel_ptx, + Err(err) => abort_call_site!( "Kernel compilation generated invalid PTX: internal nul byte: {:?}", err - ); - } + ), + }; - // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 - let kernel_ptx = syn::LitByteStr::new(&kernel_ptx, proc_macro2::Span::call_site()); - // Safety: the validity of kernel_ptx as a CStr was just checked above - let kernel_ptx = - quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#kernel_ptx) } }; + let kernel_ptx = proc_macro::Literal::c_string(&kernel_ptx); + let kernel_ptx = proc_macro2::TokenStream::from(proc_macro::TokenStream::from( + proc_macro::TokenTree::Literal(kernel_ptx), + )); (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = #kernel_ptx; #(#type_layouts)* }) .into() diff --git a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 5653a5539..2bc50b0e5 100644 --- a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -1,4 +1,4 @@ -use std::ffi::CStr; +use std::ffi::CString; use proc_macro::TokenStream; @@ -27,23 +27,16 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { format!("{kernel}_kernel") }; - let mut mangled_kernel_ident = mangled_kernel_ident.into_bytes(); - mangled_kernel_ident.push(b'\0'); - - if let Err(err) = CStr::from_bytes_with_nul(&mangled_kernel_ident) { - abort_call_site!( + let mangled_kernel_ident = match CString::new(mangled_kernel_ident) { + Ok(mangled_kernel_ident) => mangled_kernel_ident, + Err(err) => abort_call_site!( "Kernel compilation generated invalid kernel entry point: internal nul byte: {:?}", err - ); - } - - // TODO: CStr constructor blocked on https://github.com/rust-lang/rust/issues/118560 - let mangled_kernel_ident = - syn::LitByteStr::new(&mangled_kernel_ident, proc_macro2::Span::call_site()); - // Safety: the validity of mangled_kernel_ident as a CStr was just checked above - let mangled_kernel_ident = quote! { unsafe { ::core::ffi::CStr::from_bytes_with_nul_unchecked(#mangled_kernel_ident) } }; + ), + }; - (quote! 
{ #mangled_kernel_ident }).into() + let mangled_kernel_ident = proc_macro::Literal::c_string(&mangled_kernel_ident); + proc_macro::TokenTree::Literal(mangled_kernel_ident).into() } struct SpecialiseMangleConfig { diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index b26a78531..436380ff3 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -5,6 +5,7 @@ #![feature(let_chains)] #![feature(map_try_insert)] #![feature(proc_macro_def_site)] +#![feature(proc_macro_c_str_literals)] #![feature(cfg_version)] #![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/src/device/utils.rs b/src/device/utils.rs index cbc5080ab..8447c5235 100644 --- a/src/device/utils.rs +++ b/src/device/utils.rs @@ -1,15 +1,27 @@ use crate::deps::alloc::{fmt, string::String}; /// Abort the CUDA kernel using the `trap` system call. +/// +/// [`abort`] poisons the CUDA context and no more work can be performed in it. #[allow(clippy::inline_always)] #[inline(always)] pub fn abort() -> ! { unsafe { ::core::arch::nvptx::trap() } } +/// Exit the CUDA kernel using the `exit` instruction. +/// +/// # Safety +/// +/// [`exit`] quits the kernel early and any mutable data accessible outside this +/// kernel launch (by the host or a subsequent kernel launch) may be in an +/// inconsistent state. Therefore, kernel failure must be communicated back to +/// host and handled in some other manner. +/// +/// Safely return from the main kernel function instead. #[allow(clippy::inline_always)] #[inline(always)] -pub fn exit() -> ! { +pub unsafe fn exit() -> ! { unsafe { ::core::arch::asm!("exit;", options(noreturn)) } } @@ -68,14 +80,28 @@ pub fn print(args: ::core::fmt::Arguments) { } } -// TODO: docs +/// Helper function to efficiently pretty-print a [`core::panic::PanicInfo`] +/// using the `vprintf` system call. 
+/// +/// If `allow_dynamic_message` is set, +/// [`alloc::fmt::format`](crate::deps::alloc::fmt::format) is used to print +/// [`core::panic::PanicInfo::message`] message when +/// [`core::fmt::Arguments::as_str`] returns [`None`]. Note that this may pull +/// in a large amount of string formatting and dynamic allocation code. +/// If unset, a default placeholder panic message is printed instead. +/// +/// If `allow_dynamic_payload` is set, [`core::panic::PanicInfo::payload`] is +/// checked for [`&str`] and [`String`] to get a message to print if +/// [`core::panic::PanicInfo::message`] returns [`None`]. Note that this may +/// pull in some dynamic dispatch code. If unset, a default placeholder panic +/// message is printed instead. #[allow(clippy::inline_always)] #[inline(always)] -pub fn pretty_panic_handler( +pub fn pretty_print_panic_info( info: &::core::panic::PanicInfo, allow_dynamic_message: bool, allow_dynamic_payload: bool, -) -> ! { +) { #[repr(C)] struct FormatArgs { file_len: u32, @@ -140,15 +166,14 @@ pub fn pretty_panic_handler( ::core::ptr::from_ref(&args).cast(), ); } - - exit() } -// TODO: docs +/// Helper function to efficiently pretty-print an error message (inside an +/// allocation error handler) using the `vprintf` system call. #[track_caller] #[allow(clippy::inline_always)] #[inline(always)] -pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { +pub fn pretty_print_alloc_error(layout: ::core::alloc::Layout) { #[repr(C)] struct FormatArgs { size: usize, @@ -186,6 +211,4 @@ pub fn pretty_alloc_error_handler(layout: ::core::alloc::Layout) -> ! { ::core::ptr::from_ref(&args).cast(), ); } - - exit() } diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index 42e13e0ce..e69b0b15a 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -40,7 +40,8 @@ mod sealed { pub struct Token; } -#[cfg(feature = "host")] // FIXME: make private? 
+#[cfg(all(feature = "host", not(doc)))] +#[doc(hidden)] pub trait WithNewAsync< 'stream, P: ?Sized + CudaKernelParameter, @@ -48,13 +49,12 @@ pub trait WithNewAsync< E: From, > { - #[allow(clippy::missing_errors_doc)] // FIXME fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result where P: 'b; } -#[cfg(feature = "host")] // FIXME: make private? +#[cfg(all(feature = "host", not(doc)))] impl< 'stream, P: ?Sized + CudaKernelParameter, @@ -72,6 +72,7 @@ impl< } #[cfg(feature = "device")] +#[doc(hidden)] pub trait WithFfiAsDevice { fn with<'b>(self, param: P::DeviceType<'b>) -> O where @@ -108,13 +109,14 @@ pub trait CudaKernelParameter: sealed::Sealed { #[cfg(feature = "host")] #[allow(clippy::missing_errors_doc)] // FIXME - fn with_new_async<'stream, 'param, O, E: From>( + fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where - Self: 'param; + Self: 'b; #[doc(hidden)] #[cfg(feature = "host")] diff --git a/src/kernel/param.rs b/src/kernel/param.rs index ff53f6dd4..c87148c7a 100644 --- a/src/kernel/param.rs +++ b/src/kernel/param.rs @@ -82,7 +82,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -168,7 +169,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl 
FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -253,7 +255,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -377,7 +380,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -514,7 +518,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -600,7 +605,8 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -684,7 +690,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, 
) -> Result where Self: 'b, @@ -774,7 +781,8 @@ impl< fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -857,7 +865,8 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -940,7 +949,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -1067,7 +1077,8 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, @@ -1144,7 +1155,8 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete fn with_new_async<'stream, 'b, O, E: From>( param: Self::SyncHostType, _stream: crate::host::Stream<'stream>, - inner: impl super::WithNewAsync<'stream, Self, O, E>, + #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, 
Self, O, E>, + #[cfg(doc)] inner: impl FnOnce(Self::AsyncHostType<'stream, 'b>) -> Result, ) -> Result where Self: 'b, diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs index 0e9c3c34d..3f1d1e160 100644 --- a/src/lend/impls/option.rs +++ b/src/lend/impls/option.rs @@ -147,10 +147,8 @@ unsafe impl RustToCudaAsync for Option { #[allow(clippy::option_if_let_else)] let (r#async, alloc_tail) = RustToCudaAsync::restore_async( - this.map_mut(|value| match value { - Some(value) => value, - None => unreachable!(), // TODO - }), + // Safety: we have already established value is Some above + this.map_mut(|value| unsafe { value.as_mut().unwrap_unchecked() }), CombinedCudaAlloc::new(alloc_front, alloc_tail), stream, )?; From 30986364465cdc9f50602f8223e2fa338977a0d3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 18 Jan 2024 14:10:14 +0000 Subject: [PATCH 110/120] Fix some clippy lints, add FIXMEs for others --- rust-cuda-derive/src/lib.rs | 2 ++ rust-cuda-kernel/src/kernel/link/error.rs | 9 +++--- rust-cuda-kernel/src/kernel/link/mod.rs | 32 ++++++++----------- .../src/kernel/link/ptx_compiler_sys.rs | 2 +- .../src/kernel/specialise/entry_point.rs | 1 + .../wrapper/generate/cuda_generic_function.rs | 6 ++-- rust-cuda-kernel/src/lib.rs | 10 +++++- src/lib.rs | 5 ++- 8 files changed, 37 insertions(+), 30 deletions(-) diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index fba846798..a560b6d67 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -5,6 +5,8 @@ #![deny(clippy::perf)] #![deny(clippy::style)] #![deny(clippy::suspicious)] +#![deny(unsafe_code)] +// #![warn(missing_docs)] // FIXME #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] diff --git a/rust-cuda-kernel/src/kernel/link/error.rs b/rust-cuda-kernel/src/kernel/link/error.rs index 0c83e19a5..811269ccc 100644 --- a/rust-cuda-kernel/src/kernel/link/error.rs +++ 
b/rust-cuda-kernel/src/kernel/link/error.rs @@ -22,15 +22,14 @@ pub fn emit_ptx_build_error() { let call_site = proc_macro::Span::call_site(); - let (byte_start, byte_end) = - if let Some(captures) = PROC_MACRO_SPAN_REGEX.captures(&format!("{call_site:?}")) { + let (byte_start, byte_end) = PROC_MACRO_SPAN_REGEX + .captures(&format!("{call_site:?}")) + .map_or((0_u32, 0_u32), |captures| { ( captures["start"].parse().unwrap_or(0_u32), captures["end"].parse().unwrap_or(0_u32), ) - } else { - (0_u32, 0_u32) - }; + }); let span = DiagnosticSpanBuilder::default() .file_name( diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index b83eab9d2..27c2533c6 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -386,6 +386,7 @@ fn check_kernel_ptx( ) { let compiler = { let mut compiler = std::ptr::null_mut(); + #[allow(unsafe_code)] // FFI if let Err(err) = NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCreate( addr_of_mut!(compiler), @@ -451,6 +452,7 @@ fn check_kernel_ptx( let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, @@ -493,6 +495,7 @@ fn check_kernel_ptx( let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerCompile( compiler, @@ -505,6 +508,7 @@ fn check_kernel_ptx( let error_log = (|| { let mut error_log_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) })?; @@ -514,23 +518,20 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + let mut error_log: Vec = vec![0; error_log_size as usize]; + #[allow(unsafe_code)] // FFI 
NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - error_log.set_len(error_log_size as usize); - } - Ok(Some(String::from_utf8_lossy(&error_log).into_owned())) })(); let info_log = (|| { let mut info_log_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) })?; @@ -540,17 +541,13 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); + let mut info_log: Vec = vec![0; info_log_size as usize]; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - info_log.set_len(info_log_size as usize); - } - Ok(Some(String::from_utf8_lossy(&info_log).into_owned())) })(); @@ -561,6 +558,7 @@ fn check_kernel_ptx( let mut binary_size = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetCompiledProgramSize( compiler, @@ -573,17 +571,13 @@ fn check_kernel_ptx( } #[allow(clippy::cast_possible_truncation)] - let mut binary: Vec = Vec::with_capacity(binary_size as usize); + let mut binary: Vec = vec![0; binary_size as usize]; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetCompiledProgram(compiler, binary.as_mut_ptr().cast()) })?; - #[allow(clippy::cast_possible_truncation)] - unsafe { - binary.set_len(binary_size as usize); - } - Ok(Some(binary)) })(); @@ -591,6 +585,7 @@ fn check_kernel_ptx( let mut major = 0; let mut minor = 0; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) })?; @@ -600,6 +595,7 @@ fn check_kernel_ptx( 
let drop = { let mut compiler = compiler; + #[allow(unsafe_code)] // FFI NvptxError::try_err_from(unsafe { ptx_compiler_sys::nvPTXCompilerDestroy(addr_of_mut!(compiler)) }) diff --git a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs index fac72cebf..7fffc7b4c 100644 --- a/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs +++ b/rust-cuda-kernel/src/kernel/link/ptx_compiler_sys.rs @@ -42,7 +42,7 @@ impl NvptxError { const NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION: NvptxCompileResult = 7; const NVPTXCOMPILE_SUCCESS: NvptxCompileResult = 0; - pub fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { + pub const fn try_err_from(result: NvptxCompileResult) -> Result<(), Self> { match result { Self::NVPTXCOMPILE_SUCCESS => Ok(()), Self::NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE => Err(Self::InvalidCompilerHandle), diff --git a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs index 2bc50b0e5..1c80b7899 100644 --- a/rust-cuda-kernel/src/kernel/specialise/entry_point.rs +++ b/rust-cuda-kernel/src/kernel/specialise/entry_point.rs @@ -18,6 +18,7 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { }, }; + #[allow(clippy::option_if_let_else)] let mangled_kernel_ident = if let Some(specialisation) = specialisation { format!( "{kernel}_kernel_{:016x}", diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index ccf21c96b..00e00d7d8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -38,9 +38,7 @@ pub(in super::super) fn quote_cuda_generic_function( elem, }) = &**ty { - let lifetime = if let Some(lifetime) = lifetime { - lifetime.clone() - } else { + let lifetime = lifetime.clone().unwrap_or_else(|| { let 
lifetime = syn::Lifetime::new(&format!("'__rust_cuda_lt_{i}"), ty.span()); generic_params.insert( @@ -53,7 +51,7 @@ pub(in super::super) fn quote_cuda_generic_function( }), ); lifetime - }; + }); let lt = quote!(#lifetime); ( syn::Type::Reference(syn::TypeReference { diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 436380ff3..6aa0d44c7 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -1,4 +1,12 @@ -#![deny(clippy::pedantic)] +#![deny(clippy::complexity)] +#![deny(clippy::correctness)] +#![warn(clippy::nursery)] +#![warn(clippy::pedantic)] +#![deny(clippy::perf)] +#![deny(clippy::style)] +#![deny(clippy::suspicious)] +#![deny(unsafe_code)] +// #![warn(missing_docs)] // FIXME #![feature(box_patterns)] #![feature(proc_macro_tracked_env)] #![feature(proc_macro_span)] diff --git a/src/lib.rs b/src/lib.rs index c782c4047..2a63674fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,10 @@ #![deny(clippy::perf)] #![deny(clippy::style)] #![deny(clippy::suspicious)] -#![allow(clippy::useless_attribute)] +// #![warn(clippy::multiple_unsafe_ops_per_block)] // FIXME +// #![warn(clippy::undocumented_unsafe_blocks)] // FIXME +#![deny(unused_unsafe)] +// #![warn(missing_docs)] // FIXME #![cfg_attr(all(any(feature = "device", target_os = "cuda"), not(doc)), no_std)] #![feature(associated_type_bounds)] #![feature(auto_traits)] From 54eacc9245aea9118970fdd20ae03b3008b569b3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Fri, 19 Jan 2024 05:04:47 +0000 Subject: [PATCH 111/120] Add docs for rust-cuda-derive --- README.md | 5 +- rust-cuda-derive/src/lib.rs | 94 ++++++++++++++++++- rust-cuda-derive/src/rust_to_cuda/field_ty.rs | 6 +- rust-cuda-derive/src/rust_to_cuda/generics.rs | 10 +- rust-cuda-kernel/src/lib.rs | 22 +++++ src/lib.rs | 22 +++++ 6 files changed, 146 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e9b24ddbb..5080b7033 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ -# rust-cuda   
[![CI Status]][workflow] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod] +# rust-cuda   [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License Status]][fossa] [![Code Coverage]][codecov] [![Gitpod Ready-to-Code]][gitpod] [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +[MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +[repo]: https://github.com/juntyr/rust-cuda + [Rust Doc]: https://img.shields.io/badge/docs-main-blue [docs]: https://juntyr.github.io/rust-cuda/ diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index a560b6d67..5b897a8b2 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -1,3 +1,37 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_derive/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! 
[Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-derive` provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) +//! derive macro for the +//! [`rust_cuda::lend::RustToCuda`] +//! utility trait, which enables the usage of the +//! [`rust_cuda::lend::LendToCuda`] +//! trait that allows Rust data structures to be shared with CUDA kernels. +//! +//! The async variants of both traits are *optionally* implemented as well. +//! +//! [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +//! [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] @@ -6,7 +40,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![deny(unsafe_code)] -// #![warn(missing_docs)] // FIXME +#![deny(missing_docs)] #![feature(if_let_guard)] #![feature(let_chains)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] @@ -22,6 +56,64 @@ mod rust_to_cuda; #[proc_macro_error] #[proc_macro_derive(LendRustToCuda, attributes(cuda))] +/// Provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) +/// derive macro for the +/// [`rust_cuda::lend::RustToCuda`] +/// utility trait, which enables the usage of the +/// [`rust_cuda::lend::LendToCuda`] +/// trait that allows Rust data structures to be shared with CUDA kernels. +/// +/// At the moment, only +/// [`struct`](https://doc.rust-lang.org/std/keyword.struct.html)s are supported +/// by this derive macro. +/// +/// The derive also accepts a `#[cuda(...)]` attribute. You can annotate the +/// entire struct with the `#[cuda(...)]` to configure the implementation as +/// follows: +/// +/// - `#[cuda(crate = "")]` changes the path to the [`rust-cuda`] +/// crate that the derive uses, which by default is `rust_cuda`. 
+/// - `#[cuda(bound = "")]` adds the provided predicate to the +/// where clause of the trait implementation. +/// - `#[cuda(free = "")]` removes the the auto-added trait bounds for the +/// type parameter `` from the trait implementation, e.g. when +/// implementing a wrapper around [`std::marker::PhantomData`] which should +/// implement the trait for any `T`. +/// - `#[cuda(async = )]` explicitly enables or disables the async +/// implementation of the trait, [`rust_cuda::lend::RustToCudaAsync`]. By +/// default, `#[cuda(async = true)]` is set. +/// - `#[cuda(layout::ATTR = "VALUE")]` adds the `#[layout(ATTR = "VALUE")]` +/// attribute to the [`#derive(const_type_layout::TypeLayout)`] derive for +/// this struct's [`rust_cuda::lend::RustToCuda::CudaRepresentation`]. +/// - `#[cuda(ignore)]` removes all subsequent attributes from the generated +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. +/// +/// Additionally, the `#[cuda(...)]` attribute can also be applied individually +/// to the fields of the struct to customise the implementation as follows: +/// +/// - `#[cuda(embed)]` signals that this field has a non-identity CUDA +/// representation and should be embedded by using the +/// [`rust_cuda::lend::RustToCuda`] implementation of this field's type. When +/// this attribute is not specified, the field must instead implement +/// [`Copy`], [`rust_cuda::safety::PortableBitSemantics`], and +/// [`const_type_layout::TypeGraphLayout`]. +/// - `#[cuda(embed = "")]` works like `#[cuda(embed)]` but can be +/// used when the field's type does not implement +/// [`rust_cuda::lend::RustToCuda`] itself, but some `` exists, +/// which implements [`rust_cuda::lend::RustToCudaProxy`] for the field's +/// type. +/// - `#[cuda(ignore)]` removes all subsequent attributes from this field in the +/// generated [`rust_cuda::lend::RustToCuda::CudaRepresentation`] struct. 
+/// +/// [`rust_cuda::lend::RustToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html +/// [`rust_cuda::lend::LendToCuda`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.LendToCuda.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda +/// [`rust_cuda::lend::RustToCudaAsync`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaAsync.html +/// [`#derive(const_type_layout::TypeLayout)`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/derive.TypeLayout.html +/// [`rust_cuda::lend::RustToCuda::CudaRepresentation`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCuda.html#associatedtype.CudaRepresentation +/// [`rust_cuda::safety::PortableBitSemantics`]: https://juntyr.github.io/rust-cuda/rust_cuda/safety/trait.PortableBitSemantics.html +/// [`const_type_layout::TypeGraphLayout`]: https://docs.rs/const-type-layout/0.2.1/const_type_layout/trait.TypeGraphLayout.html +/// [`rust_cuda::lend::RustToCudaProxy`]: https://juntyr.github.io/rust-cuda/rust_cuda/lend/trait.RustToCudaProxy.html pub fn rust_to_cuda_derive(input: TokenStream) -> TokenStream { // Note: We cannot report a more precise span yet let ast = match syn::parse(input) { diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs index b2f624d66..c9fe48b77 100644 --- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs +++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs @@ -69,7 +69,7 @@ pub fn swap_field_type_and_filter_attrs( Err(err) => emit_error!( s.span(), "[rust-cuda]: Invalid #[cuda(embed = \ - \"\")] field attribute: {}.", + \"\")] field attribute: {}.", err ), } @@ -84,7 +84,7 @@ pub fn swap_field_type_and_filter_attrs( emit_error!( meta.span(), "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ - #[cuda(embed = \"\")] field attribute" + #[cuda(embed = \"\")] field attribute" ); } } @@ -93,7 +93,7 @@ pub fn swap_field_type_and_filter_attrs( emit_error!( 
attr.span(), "[rust-cuda]: Expected #[cuda(ignore)] / #[cuda(embed)] / \ - #[cuda(embed = \"\")] field attribute." + #[cuda(embed = \"\")] field attribute." ); } diff --git a/rust-cuda-derive/src/rust_to_cuda/generics.rs b/rust-cuda-derive/src/rust_to_cuda/generics.rs index 4325f39fb..f090f5c70 100644 --- a/rust-cuda-derive/src/rust_to_cuda/generics.rs +++ b/rust-cuda-derive/src/rust_to_cuda/generics.rs @@ -159,10 +159,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( _ => { emit_error!( meta.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / \ - #[cuda(bound = \"\")] / \ - #[cuda(crate = \"\")] / \ - #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); }, } @@ -170,10 +167,7 @@ pub fn expand_cuda_struct_generics_where_requested_in_attrs( } else { emit_error!( attr.span(), - "[rust-cuda]: Expected #[cuda(ignore)] / \ - #[cuda(bound = \"\")] / \ - #[cuda(crate = \"\")] / \ - #[cuda(layout::ATTR = \"VALUE\")] struct attribute." + "[rust-cuda]: Expected #[cuda(crate = \"\")] / #[cuda(bound = \"\")] / #[cuda(free = \"\")] / #[cuda(async = )] / #[cuda(layout::ATTR = \"VALUE\")] / #[cuda(ignore)] struct attribute." ); } diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 6aa0d44c7..86d4137bb 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -1,3 +1,25 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! 
[repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda_kernel/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! [codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] diff --git a/src/lib.rs b/src/lib.rs index 2a63674fa..df8c42290 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,25 @@ +//! [![CI Status]][workflow] [![MSRV]][repo] [![Rust Doc]][docs] [![License +//! Status]][fossa] [![Code Coverage]][codecov] [![Gitpod +//! Ready-to-Code]][gitpod] +//! +//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main +//! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain +//! +//! [MSRV]: https://img.shields.io/badge/MSRV-1.77.0--nightly-orange +//! [repo]: https://github.com/juntyr/rust-cuda +//! +//! [Rust Doc]: https://img.shields.io/badge/docs-main-blue +//! [docs]: https://juntyr.github.io/rust-cuda/rust_cuda/ +//! +//! [License Status]: https://app.fossa.com/api/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda.svg?type=shield +//! [fossa]: https://app.fossa.com/projects/custom%2B26490%2Fgithub.com%2Fjuntyr%2Frust-cuda?ref=badge_shield +//! +//! [Code Coverage]: https://img.shields.io/codecov/c/github/juntyr/rust-cuda?token=wfeAeybbbx +//! 
[codecov]: https://codecov.io/gh/juntyr/rust-cuda +//! +//! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod +//! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda + #![deny(clippy::complexity)] #![deny(clippy::correctness)] #![warn(clippy::nursery)] From fd9682d2635117a5dc1af302fa1c98de882c631d Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 11:58:35 +0000 Subject: [PATCH 112/120] Small refactoring + added docs for rust-cuda-kernel --- examples/print/src/main.rs | 2 +- examples/single-source/src/main.rs | 2 +- rust-cuda-derive/src/lib.rs | 4 +- rust-cuda-kernel/src/kernel/link/mod.rs | 18 +-- rust-cuda-kernel/src/kernel/lints.rs | 12 +- rust-cuda-kernel/src/kernel/specialise/mod.rs | 2 +- .../specialise/{ty.rs => param_type.rs} | 7 +- rust-cuda-kernel/src/kernel/wrapper/config.rs | 6 +- .../kernel/wrapper/generate/cuda_wrapper.rs | 4 +- .../args_trait.rs | 0 .../get_ptx.rs | 2 +- .../mod.rs | 6 +- .../src/kernel/wrapper/generate/mod.rs | 2 +- rust-cuda-kernel/src/kernel/wrapper/mod.rs | 12 +- rust-cuda-kernel/src/lib.rs | 127 +++++++++++++++++- src/device/mod.rs | 2 +- src/kernel/mod.rs | 2 +- 17 files changed, 164 insertions(+), 46 deletions(-) rename rust-cuda-kernel/src/kernel/specialise/{ty.rs => param_type.rs} (96%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/args_trait.rs (100%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/get_ptx.rs (99%) rename rust-cuda-kernel/src/kernel/wrapper/generate/{host_linker_macro => host_link_macro}/mod.rs (96%) diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 93a50ba55..5aec2b391 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -22,7 +22,7 @@ pub enum Action { } #[rust_cuda::kernel::kernel(use link! 
for impl)] -#[kernel(allow(ptx::local_memory_usage))] +#[kernel(allow(ptx::local_memory_use))] pub fn kernel(action: rust_cuda::kernel::param::PerThreadShallowCopy) { match action { Action::Print => rust_cuda::device::utils::println!("println! from CUDA kernel"), diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index 89bbdf990..f0aa25ce7 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -49,7 +49,7 @@ pub struct Triple(i32, i32, i32); #[kernel(crate = "rc")] #[kernel( allow(ptx::double_precision_use), - forbid(ptx::local_memory_usage, ptx::register_spills) + forbid(ptx::local_memory_use, ptx::register_spills) )] pub fn kernel< 'a, diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs index 5b897a8b2..514bbf66e 100644 --- a/rust-cuda-derive/src/lib.rs +++ b/rust-cuda-derive/src/lib.rs @@ -20,8 +20,8 @@ //! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod //! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda //! -//! `rust-cuda-derive` provides the [`#[derive(LendRustToCuda)`](LendRustToCuda) -//! derive macro for the +//! `rust-cuda-derive` provides the +//! [`#[derive(LendRustToCuda)]`](LendRustToCuda) derive macro for the //! [`rust_cuda::lend::RustToCuda`] //! utility trait, which enables the usage of the //! 
[`rust_cuda::lend::LendToCuda`] diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 27c2533c6..10d3b63ed 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -55,7 +55,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { }, }; - let kernel_ptx = compile_kernel(&kernel, &crate_name, &crate_path, Specialisation::Check); + let kernel_ptx = compile_kernel_ptx(&kernel, &crate_name, &crate_path, Specialisation::Check); let Some(kernel_ptx) = kernel_ptx else { return quote!(::core::compile_error!("rust-cuda PTX kernel check failed");).into(); @@ -72,7 +72,7 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { } #[allow(clippy::module_name_repetitions)] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { let ptx_cstr_ident = syn::Ident::new(PTX_CSTR_IDENT, Span::call_site()); let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, Span::call_site()); @@ -93,7 +93,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "link_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ + "compile_kernel!(KERNEL HASH NAME PATH SPECIALISATION LINTS,*) expects KERNEL and \ HASH identifiers, NAME and PATH string literals, and SPECIALISATION and LINTS \ tokens: {:?}", err @@ -108,7 +108,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { .into(); } - let Some(mut kernel_ptx) = compile_kernel( + let Some(mut kernel_ptx) = compile_kernel_ptx( &kernel, &crate_name, &crate_path, @@ -285,7 +285,7 @@ fn check_kernel_ptx_and_report( Ok(None) => (), Ok(Some(binary)) => { if ptx_lint_levels - .get(&PtxLint::DumpBinary) + .get(&PtxLint::DumpAssembly) .map_or(false, |level| *level > LintLevel::Allow) { const HEX: [char; 16] = [ @@ -299,7 +299,7 @@ fn check_kernel_ptx_and_report( } if ptx_lint_levels - 
.get(&PtxLint::DumpBinary) + .get(&PtxLint::DumpAssembly) .map_or(false, |level| *level > LintLevel::Warn) { emit_call_site_error!( @@ -431,7 +431,7 @@ fn check_kernel_ptx( options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels - .get(&PtxLint::LocalMemoryUsage) + .get(&PtxLint::LocalMemoryUse) .map_or(false, |level| *level > LintLevel::Warn) { options.push(c"--warn-on-local-memory-usage"); @@ -475,7 +475,7 @@ fn check_kernel_ptx( options.push(c"--warn-on-double-precision-use"); } if ptx_lint_levels - .get(&PtxLint::LocalMemoryUsage) + .get(&PtxLint::LocalMemoryUse) .map_or(false, |level| *level > LintLevel::Allow) { options.push(c"--warn-on-local-memory-usage"); @@ -604,7 +604,7 @@ fn check_kernel_ptx( (result, error_log, info_log, binary, version, drop) } -fn compile_kernel( +fn compile_kernel_ptx( kernel: &syn::Ident, crate_name: &str, crate_path: &Path, diff --git a/rust-cuda-kernel/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs index 6c198b71a..5fbe415b2 100644 --- a/rust-cuda-kernel/src/kernel/lints.rs +++ b/rust-cuda-kernel/src/kernel/lints.rs @@ -88,9 +88,9 @@ pub fn parse_ptx_lint_level( let lint = match lint { l if l == "verbose" => PtxLint::Verbose, l if l == "double_precision_use" => PtxLint::DoublePrecisionUse, - l if l == "local_memory_usage" => PtxLint::LocalMemoryUsage, + l if l == "local_memory_use" => PtxLint::LocalMemoryUse, l if l == "register_spills" => PtxLint::RegisterSpills, - l if l == "dump_binary" => PtxLint::DumpBinary, + l if l == "dump_assembly" => PtxLint::DumpAssembly, l if l == "dynamic_stack_size" => PtxLint::DynamicStackSize, _ => { emit_error!( @@ -151,9 +151,9 @@ impl fmt::Display for LintLevel { pub enum PtxLint { Verbose, DoublePrecisionUse, - LocalMemoryUsage, + LocalMemoryUse, RegisterSpills, - DumpBinary, + DumpAssembly, DynamicStackSize, } @@ -162,9 +162,9 @@ impl fmt::Display for PtxLint { match self { Self::Verbose => fmt.write_str("verbose"), Self::DoublePrecisionUse => 
fmt.write_str("double_precision_use"), - Self::LocalMemoryUsage => fmt.write_str("local_memory_usage"), + Self::LocalMemoryUse => fmt.write_str("local_memory_use"), Self::RegisterSpills => fmt.write_str("register_spills"), - Self::DumpBinary => fmt.write_str("dump_binary"), + Self::DumpAssembly => fmt.write_str("dump_assembly"), Self::DynamicStackSize => fmt.write_str("dynamic_stack_size"), } } diff --git a/rust-cuda-kernel/src/kernel/specialise/mod.rs b/rust-cuda-kernel/src/kernel/specialise/mod.rs index 6d30d4d5d..e5dcd518e 100644 --- a/rust-cuda-kernel/src/kernel/specialise/mod.rs +++ b/rust-cuda-kernel/src/kernel/specialise/mod.rs @@ -1,3 +1,3 @@ pub mod entry_point; pub mod function; -pub mod ty; +pub mod param_type; diff --git a/rust-cuda-kernel/src/kernel/specialise/ty.rs b/rust-cuda-kernel/src/kernel/specialise/param_type.rs similarity index 96% rename from rust-cuda-kernel/src/kernel/specialise/ty.rs rename to rust-cuda-kernel/src/kernel/specialise/param_type.rs index 1671f43f0..a398e5eac 100644 --- a/rust-cuda-kernel/src/kernel/specialise/ty.rs +++ b/rust-cuda-kernel/src/kernel/specialise/param_type.rs @@ -1,7 +1,8 @@ use proc_macro::TokenStream; use quote::ToTokens; -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { +#[allow(clippy::module_name_repetitions)] +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { let SpecialiseTypeConfig { mut ty, generics, @@ -10,8 +11,8 @@ pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "specialise_kernel_type!(TY for GENERICS in KERNEL) expects TY type, GENERICS \ - generics, and KERNEL identifier: {:?}", + "specialise_kernel_param_type!(TY for GENERICS in KERNEL) expects TY type, \ + GENERICS generics, and KERNEL identifier: {:?}", err ) }, diff --git a/rust-cuda-kernel/src/kernel/wrapper/config.rs b/rust-cuda-kernel/src/kernel/wrapper/config.rs index 8f8cd2240..66807f2d1 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/config.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/config.rs @@ -1,17 +1,17 @@ pub(super) struct KernelConfig { pub(super) visibility: Option, - pub(super) linker: syn::Ident, + pub(super) link: syn::Ident, } impl syn::parse::Parse for KernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let visibility: Option = input.parse()?; let _use: syn::token::Use = input.parse()?; - let linker: syn::Ident = input.parse()?; + let link: syn::Ident = input.parse()?; let _bang: syn::token::Bang = input.parse()?; let _for: syn::token::For = input.parse()?; let _impl: syn::token::Impl = input.parse()?; - Ok(Self { visibility, linker }) + Ok(Self { visibility, link }) } } diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index 938074e56..c3cb11458 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -31,7 +31,7 @@ pub(in super::super) fn quote_cuda_wrapper( }, |inner, (i, syn::PatType { pat, ty, .. })| { let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #generics in #func_ident) + #crate_path::device::specialise_kernel_param_type!(#ty for #generics in #func_ident) }; // Load the device param from its FFI representation @@ -110,7 +110,7 @@ fn specialise_ffi_input_types( ty, }| { let specialised_ty = quote::quote_spanned! { ty.span()=> - #crate_path::device::specialise_kernel_type!(#ty for #impl_generics in #func_ident) + #crate_path::device::specialise_kernel_param_type!(#ty for #impl_generics in #func_ident) }; let ffi_ty: syn::Type = syn::parse_quote_spanned! 
{ ty.span()=> diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs similarity index 100% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/args_trait.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs similarity index 99% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 955093e5c..5504d12a8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -95,7 +95,7 @@ pub(super) fn quote_get_ptx( static #private_func_params: #cpu_func_lifetime_erased_types; )* } - #crate_path::kernel::link_kernel!{ + #crate_path::kernel::compile_kernel!{ #func_ident #func_ident_hash #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token #ptx_lint_levels diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs similarity index 96% rename from rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs rename to rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs index 36479b62a..ea5daccdc 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_linker_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -8,10 +8,10 @@ mod get_ptx; use get_ptx::quote_get_ptx; #[allow(clippy::too_many_arguments)] // FIXME -pub(in super::super) fn quote_host_linker_macro( +pub(in super::super) fn quote_host_link_macro( crate_path: &syn::Path, 
KernelConfig { - visibility, linker, .. + visibility, link, .. }: &KernelConfig, decl_generics @ DeclGenerics { generic_start_token, @@ -86,7 +86,7 @@ pub(in super::super) fn quote_host_linker_macro( quote! { #[cfg(not(target_os = "cuda"))] - #visibility macro #linker( + #visibility macro #link( impl #func_ident_name #generic_start_token #(#macro_generics),* $(,)? #generic_close_token for $ptx:ident diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs index bf2c293cc..829cb0433 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/mod.rs @@ -1,4 +1,4 @@ pub mod cuda_generic_function; pub mod cuda_wrapper; pub mod host_kernel_ty; -pub mod host_linker_macro; +pub mod host_link_macro; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index f400e3147..0c4f743ab 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ -14,7 +14,7 @@ use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; use config::KernelConfig; use generate::{ cuda_generic_function::quote_cuda_generic_function, cuda_wrapper::quote_cuda_wrapper, - host_kernel_ty::quote_host_kernel_ty, host_linker_macro::quote_host_linker_macro, + host_kernel_ty::quote_host_kernel_ty, host_link_macro::quote_host_link_macro, }; use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; @@ -33,7 +33,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { Ok(config) => config, Err(err) => { abort_call_site!( - "#[kernel(pub? use LINKER! for impl)] expects LINKER identifier: {:?}", + "#[kernel(pub? use LINK! 
for impl)] expects LINK macro identifier: {:?}", err ) }, @@ -107,9 +107,9 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { let _ = ptx_lint_levels.try_insert(PtxLint::Verbose, LintLevel::Allow); let _ = ptx_lint_levels.try_insert(PtxLint::DoublePrecisionUse, LintLevel::Warn); - let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUsage, LintLevel::Warn); + let _ = ptx_lint_levels.try_insert(PtxLint::LocalMemoryUse, LintLevel::Warn); let _ = ptx_lint_levels.try_insert(PtxLint::RegisterSpills, LintLevel::Warn); - let _ = ptx_lint_levels.try_insert(PtxLint::DumpBinary, LintLevel::Allow); + let _ = ptx_lint_levels.try_insert(PtxLint::DumpAssembly, LintLevel::Allow); let _ = ptx_lint_levels.try_insert(PtxLint::DynamicStackSize, LintLevel::Warn); let ptx_lint_levels = { @@ -223,7 +223,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { &func.attrs, ); let host_generic_kernel_check = quote_generic_check(&crate_path, &func_ident); - let host_linker_macro = quote_host_linker_macro( + let host_link_macro = quote_host_link_macro( &crate_path, &config, &decl_generics, @@ -255,7 +255,7 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { #host_generic_kernel_check - #host_linker_macro + #host_link_macro #cuda_wrapper #cuda_generic_function diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 86d4137bb..29ac48f12 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -19,6 +19,10 @@ //! //! [Gitpod Ready-to-Code]: https://img.shields.io/badge/Gitpod-ready-blue?logo=gitpod //! [gitpod]: https://gitpod.io/#https://github.com/juntyr/rust-cuda +//! +//! `rust-cuda-kernel` provides the [`#[kernel]`](macro@kernel) attribute +//! macro. When applied to a function, it compiles it as a CUDA kernel that +//! can be *safely* called from Rust code on the host. 
#![deny(clippy::complexity)] #![deny(clippy::correctness)] @@ -28,7 +32,7 @@ #![deny(clippy::style)] #![deny(clippy::suspicious)] #![deny(unsafe_code)] -// #![warn(missing_docs)] // FIXME +#![warn(missing_docs)] #![feature(box_patterns)] #![feature(proc_macro_tracked_env)] #![feature(proc_macro_span)] @@ -51,6 +55,109 @@ mod kernel; #[proc_macro_error] #[proc_macro_attribute] +/// Provides the [`#[kernel]`](macro@kernel) attribute macro. When applied to a +/// function, it compiles it as a CUDA kernel that can be *safely* called from +/// Rust code on the host. +/// +/// The annotated function must be public, not const, not async, not have an +/// explicit ABI, not be variadic, not have a receiver (e.g. `&self`), and +/// return the unit type `()`. At the moment, the kernel function must also +/// not use a where clause – use type generic bounds instead. +/// +/// While the [`#[kernel]`](macro@kernel) attribute supports functions with any +/// number of arguments, [`rust_cuda::kernel::TypedPtxKernel`] only supports +/// launching kernels with up to 12 parameters at the moment. +/// +/// The [`#[kernel]`](macro@kernel) attribute uses the following syntax: +/// +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel(/* parameters */) { +/// /* kernel code */ +/// } +/// ``` +/// +/// where `link` is the name of a macro that will be generated to manually link +/// specific monomorphised instantiations of the (optionally generic) kernel +/// function, and the optional `pub` controls whether this macro is public or +/// private. +/// +/// Note that all kernel parameters must implement the sealed +/// [`rust_cuda::kernel::CudaKernelParameter`] trait. +/// +/// To use a specific monomorphised instantiation of the kernel, the generated +/// `link!` macro must be invoked with the following syntax: +/// +/// ```rust,ignore +/// struct KernelPtx; +/// link! 
{ impl my_kernel for KernelPtx } +/// ``` +/// for the non-generic kernel function `my_kernel` and a non-generic marker +/// type `KernelPtx`, which can be used as the generic `Kernel` type parameter +/// for [`rust_cuda::kernel::TypedPtxKernel`] to instantiate and launch the +/// kernel. Specifically, the [`rust_cuda::kernel::CompiledKernelPtx`] trait is +/// implemented for the `KernelPtx` type. +/// +/// If the kernel function is generic, the following syntax is used instead: +/// ```rust,ignore +/// #[kernel(pub? use link! for impl)] +/// fn my_kernel<'a, A, B: Bounded, const N: usize>(/* parameters */) { +/// /* kernel code */ +/// } +/// +/// struct KernelPtx<'a, A, B: Bounded, const N: usize>(/* ... */); +/// link! { impl my_kernel<'a, u32, MyStruct, 42> for KernelPtx } +/// link! { impl my_kernel<'a, bool, MyOtherStruct, 24> for KernelPtx } +/// ``` +/// +/// If the kernel generic space is closed, the `link!` macro can be made +/// private and all instantiations must be requested in the same crate that +/// defines the kernel function. If downstream code should be allowed to use +/// and compile new specific monomorphised instantiations of the kernel, the +/// `link!` macro should be publicly exported. Then, downstream code can define +/// its own `MyKernelPtx` marker types for which the kernel is linked and which +/// can be passed to [`rust_cuda::kernel::CompiledKernelPtx`]-generic code in +/// the kernel-defining crate to construct the requested +/// [`rust_cuda::kernel::TypedPtxKernel`]. +/// +/// Inside the scope of the [`#[kernel]`](macro@kernel) attribute, a helper +/// `#[kernel(...)]` attribute can be applied to the kernel function: +/// +/// - `#[kernel(crate = "")]` changes the path to the [`rust-cuda`] +/// crate that the kernel compilation uses, which by default is `rust_cuda`. 
+/// - `#[kernel(allow/warn/deny/forbid())]` checks the specified +/// CUDA-specific lint for each kernel compilation, using default Rust +/// semantics for allowing, warning on, denying, or forbidding a lint. The +/// following lints are supported: +/// - `ptx::double_precision_use`: check for any uses of [`f64`] operations +/// inside the compiled PTX binary, as they are often significantly less +/// performant on NVIDIA GPUs than [`f32`] operations. By default, +/// `#[kernel(warn(ptx::double_precision_use))]` is set. +/// - `ptx::local_memory_use`: check for any usage of local memory, which may +/// slow down kernel execution. By default, +/// `#[kernel(warn(ptx::local_memory_use))]` is set. +/// - `ptx::register_spills`: check for any spills of registers to local +/// memory. While using less registers can allow more kernels to be run in +/// parallel, register spills may also point to missed optimisations. By +/// default, `#[kernel(warn(ptx::register_spills))]` is set. +/// - `ptx::dynamic_stack_size`: check if the PTX compiler is unable to +/// statically determine the size of the required kernel function stack. +/// When the static stack size is known, the compiler may be able to keep it +/// entirely within the fast register file. However, when the stack size is +/// dynamic, more costly memory load and store operations are needed. By +/// default, `#[kernel(warn(ptx::dynamic_stack_size))]` is set. +/// - `ptx::verbose`: utility lint to output verbose PTX compiler messages as +/// warnings (`warn`) or errors (`deny` or `forbid`) or to not output them +/// (`allow`). By default, `#[kernel(allow(ptx::verbose))]` is set. +/// - `ptx::dump_assembly`: utility lint to output the compiled PTX assembly +/// code as a warning (`warn`) or an error (`deny` or `forbid`) or to not +/// output it (`allow`). By default, `#[kernel(allow(ptx::dump_assembly))]` +/// is set. 
+/// +/// [`rust_cuda::kernel::TypedPtxKernel`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/struct.TypedPtxKernel.html +/// [`rust_cuda::kernel::CudaKernelParameter`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CudaKernelParameter.html +/// [`rust_cuda::kernel::CompiledKernelPtx`]: https://juntyr.github.io/rust-cuda/rust_cuda/kernel/trait.CompiledKernelPtx.html +/// [`rust-cuda`]: https://juntyr.github.io/rust-cuda/rust_cuda pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { kernel::wrapper::kernel(attr, func) } @@ -58,13 +165,17 @@ pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn specialise_kernel_type(tokens: TokenStream) -> TokenStream { - kernel::specialise::ty::specialise_kernel_type(tokens) +/// Helper macro to specialise the generic kernel param types when compiling +/// the specialised kernel for CUDA. +pub fn specialise_kernel_param_type(tokens: TokenStream) -> TokenStream { + kernel::specialise::param_type::specialise_kernel_param_type(tokens) } #[doc(hidden)] #[proc_macro_error] #[proc_macro] +/// Helper macro to specialise the CUDA kernel entry point name, used on the +/// host for linking to it. pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { kernel::specialise::entry_point::specialise_kernel_entry_point(tokens) } @@ -72,6 +183,8 @@ pub fn specialise_kernel_entry_point(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] +/// Helper macro to specialise the name of the CUDA kernel function item, used +/// to give each specialised version a unique ident when compiling for CUDA. 
pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { kernel::specialise::function::specialise_kernel_function(attr, func) } @@ -79,6 +192,8 @@ pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> Token #[doc(hidden)] #[proc_macro_error] #[proc_macro] +/// Helper macro to cheaply check the generic CUDA kernel, used on the host to +/// provide code error feedback even when no specialised kernel is linked. pub fn check_kernel(tokens: TokenStream) -> TokenStream { kernel::link::check_kernel(tokens) } @@ -86,6 +201,8 @@ pub fn check_kernel(tokens: TokenStream) -> TokenStream { #[doc(hidden)] #[proc_macro_error] #[proc_macro] -pub fn link_kernel(tokens: TokenStream) -> TokenStream { - kernel::link::link_kernel(tokens) +/// Helper macro to compile a specialised CUDA kernel and produce its PTX +/// assembly code, which is used on the host when linking specialised kernels. +pub fn compile_kernel(tokens: TokenStream) -> TokenStream { + kernel::link::compile_kernel(tokens) } diff --git a/src/device/mod.rs b/src/device/mod.rs index 791035d51..df20ae5a8 100644 --- a/src/device/mod.rs +++ b/src/device/mod.rs @@ -1,6 +1,6 @@ #[doc(hidden)] #[cfg(feature = "kernel")] -pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_type}; +pub use rust_cuda_kernel::{specialise_kernel_function, specialise_kernel_param_type}; pub mod alloc; pub mod thread; diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs index e69b0b15a..3fc2b2e60 100644 --- a/src/kernel/mod.rs +++ b/src/kernel/mod.rs @@ -19,7 +19,7 @@ pub use rust_cuda_kernel::kernel; #[doc(hidden)] #[cfg(all(feature = "kernel", feature = "host"))] #[allow(clippy::module_name_repetitions)] -pub use rust_cuda_kernel::{check_kernel, link_kernel, specialise_kernel_entry_point}; +pub use rust_cuda_kernel::{check_kernel, compile_kernel, specialise_kernel_entry_point}; #[cfg(feature = "host")] mod ptx_jit; From d11e6d9fa9a242e15c9481376254aa245efb216a Mon Sep 17 00:00:00 
2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 12:03:33 +0000 Subject: [PATCH 113/120] Bump MSRV to 1.77-nightly --- Cargo.toml | 2 +- examples/print/src/main.rs | 1 - examples/single-source/src/main.rs | 1 - rust-cuda-derive/Cargo.toml | 8 +------- rust-cuda-kernel/Cargo.toml | 1 + rust-cuda-kernel/src/lib.rs | 1 - rust-toolchain | 1 - src/lib.rs | 2 -- 8 files changed, 3 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5aaa324bb..3b6dbf342 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" -rust-version = "1.75" # nightly +rust-version = "1.77" # nightly # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs index 5aec2b391..c99ae0df9 100644 --- a/examples/print/src/main.rs +++ b/examples/print/src/main.rs @@ -6,7 +6,6 @@ #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(decl_macro)] diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index f0aa25ce7..b4a7cec52 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -7,7 +7,6 @@ #![feature(const_type_name)] #![feature(offset_of)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] #![feature(decl_macro)] diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index 73a74907b..fc214dea7 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly # See more keys 
and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -15,10 +16,3 @@ syn = { version = "1.0", features = ["full", "fold"] } quote = "1.0" proc-macro2 = "1.0" proc-macro-error = "1.0" -# regex = "1.5" -# lazy_static = "1.4" -# serde_json = "1.0" -# cargo_metadata = { version = "0.18", features = ["builder"] } -# strip-ansi-escapes = "0.2" -# colored = "2.0" -# thiserror = "1.0" diff --git a/rust-cuda-kernel/Cargo.toml b/rust-cuda-kernel/Cargo.toml index 23e641841..b944bf875 100644 --- a/rust-cuda-kernel/Cargo.toml +++ b/rust-cuda-kernel/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +rust-version = "1.77" # nightly links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs index 29ac48f12..e6d5cf3ac 100644 --- a/rust-cuda-kernel/src/lib.rs +++ b/rust-cuda-kernel/src/lib.rs @@ -41,7 +41,6 @@ #![feature(proc_macro_def_site)] #![feature(proc_macro_c_str_literals)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] extern crate proc_macro; diff --git a/rust-toolchain b/rust-toolchain index e6cfef665..7734bcf14 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1,5 +1,4 @@ [toolchain] -# Pin to final 1.75.0 nightly channel = "nightly" components = [ "cargo", "rustfmt", "clippy" ] targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ] diff --git a/src/lib.rs b/src/lib.rs index df8c42290..1c92688b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,8 +54,6 @@ #![feature(never_type)] #![feature(layout_for_ptr)] #![feature(cfg_version)] -#![cfg_attr(not(version("1.76.0")), feature(c_str_literals))] -#![cfg_attr(not(version("1.76.0")), feature(ptr_from_ref))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] 
#![cfg_attr(feature = "device", feature(slice_ptr_get))] From 521419c50b734a55d8ccccdffc45674d681d20d1 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 17:06:24 +0000 Subject: [PATCH 114/120] Try trait-based kernel signature check --- .../generate/host_link_macro/get_ptx.rs | 13 ++-- src/safety/ptx_kernel_signature.rs | 59 +++++-------------- 2 files changed, 23 insertions(+), 49 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 5504d12a8..39b859a77 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -68,11 +68,14 @@ pub(super) fn quote_get_ptx( let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! { func_ident.span()=> - const _: #crate_path::safety::ptx_kernel_signature::Assert<{ - #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) - }>; + const _: () = #crate_path::safety::ptx_kernel_signature::check::< + { + &#crate_path::deps::const_type_layout::serialise_type_graph::< + #ffi_signature_ty + >() + }, + #ffi_signature_ident, + >(); } }; diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index eb4a63820..aa42b905e 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,51 +1,22 @@ -use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; +const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; -#[allow(clippy::module_name_repetitions)] -#[derive(PartialEq, Eq, core::marker::ConstParamTy)] -pub enum HostAndDeviceKernelSignatureTypeLayout { - 
Match, - Mismatch, +#[marker] +pub trait SameHostAndDeviceKernelSignatureTypeLayout +{ } -pub struct Assert; - -#[must_use] -pub const fn check( - device: &'static [u8], -) -> HostAndDeviceKernelSignatureTypeLayout -where - [u8; serialised_type_graph_len::()]:, +impl SameHostAndDeviceKernelSignatureTypeLayout for () {} +impl SameHostAndDeviceKernelSignatureTypeLayout + for () +{ +} +impl SameHostAndDeviceKernelSignatureTypeLayout + for () { - const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; - - // Short-circuit to avoid extra errors when PTX compilation fails - if equals(device, SIGNATURE_ERROR_MESSAGE) { - return HostAndDeviceKernelSignatureTypeLayout::Match; - } - - let host = serialise_type_graph::(); - - if equals(device, &host) { - HostAndDeviceKernelSignatureTypeLayout::Match - } else { - HostAndDeviceKernelSignatureTypeLayout::Mismatch - } } -const fn equals(device: &[u8], host: &[u8]) -> bool { - if host.len() != device.len() { - return false; - } - - let mut i = 0; - - while i < host.len() { - if host[i] != device[i] { - return false; - } - - i += 1; - } - - true +pub const fn check() +where + (): SameHostAndDeviceKernelSignatureTypeLayout, +{ } From 07dc90875a7a535006c9abae3c8e903a652814b3 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 20 Jan 2024 19:39:29 +0000 Subject: [PATCH 115/120] Try naming host kernel layout const --- .../wrapper/generate/host_link_macro/get_ptx.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 39b859a77..51ecfb0da 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -65,15 +65,18 @@ pub(super) fn quote_get_ptx( quote!() } else { let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, 
func_ident.span()); + let ffi_signature_host_ident = quote::format_ident!("{ffi_signature_ident}_HOST"); let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! { func_ident.span()=> + #[allow(dead_code)] + const #ffi_signature_host_ident: &'static [u8] = + &#crate_path::deps::const_type_layout::serialise_type_graph::< + #ffi_signature_ty + >(); + const _: () = #crate_path::safety::ptx_kernel_signature::check::< - { - &#crate_path::deps::const_type_layout::serialise_type_graph::< - #ffi_signature_ty - >() - }, + #ffi_signature_host_ident, #ffi_signature_ident, >(); } From 1c8115c219c4d48dfcaafd9dd6889001a0742277 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 21 Jan 2024 07:16:04 +0000 Subject: [PATCH 116/120] Try match against byte literal for faster comparison --- rust-cuda-kernel/src/kernel/link/mod.rs | 15 ++++++++--- .../generate/host_link_macro/get_ptx.rs | 19 +++++++------- src/safety/ptx_kernel_signature.rs | 25 ++++--------------- 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 10d3b63ed..a0df9366a 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -78,7 +78,9 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + } ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -116,7 +118,9 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + } ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -199,7 +203,12 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { + match host { + #byte_str => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match, + _ => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Mismatch, + } + } }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 51ecfb0da..a54a62b18 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -65,20 +65,16 @@ pub(super) fn quote_get_ptx( quote!() } else { let ffi_signature_ident = syn::Ident::new(KERNEL_TYPE_LAYOUT_IDENT, func_ident.span()); - let ffi_signature_host_ident = quote::format_ident!("{ffi_signature_ident}_HOST"); let ffi_signature_ty = quote! { extern "C" fn(#(#cpu_func_lifetime_erased_types),*) }; quote::quote_spanned! 
{ func_ident.span()=> - #[allow(dead_code)] - const #ffi_signature_host_ident: &'static [u8] = - &#crate_path::deps::const_type_layout::serialise_type_graph::< + const _: #crate_path::safety::ptx_kernel_signature::Assert<{ + #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident(&#crate_path::deps::const_type_layout::serialise_type_graph::< #ffi_signature_ty - >(); - - const _: () = #crate_path::safety::ptx_kernel_signature::check::< - #ffi_signature_host_ident, - #ffi_signature_ident, - >(); + >()) + }>; } }; @@ -93,6 +89,9 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + #[allow(dead_code)] + use #crate_path as rust_cuda_import; + #args_trait extern "C" { #( diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index aa42b905e..5b33567bb 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,22 +1,7 @@ -const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; - -#[marker] -pub trait SameHostAndDeviceKernelSignatureTypeLayout -{ +#[derive(PartialEq, Eq, core::marker::ConstParamTy)] +pub enum HostAndDeviceKernelSignatureTypeLayout { + Match, + Mismatch, } -impl SameHostAndDeviceKernelSignatureTypeLayout for () {} -impl SameHostAndDeviceKernelSignatureTypeLayout - for () -{ -} -impl SameHostAndDeviceKernelSignatureTypeLayout - for () -{ -} - -pub const fn check() -where - (): SameHostAndDeviceKernelSignatureTypeLayout, -{ -} +pub struct Assert; From b040cac08c51ffe3903d0e900eb1ed7ba59a38d0 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sun, 21 Jan 2024 08:00:41 +0000 Subject: [PATCH 117/120] Try with memcmp intrinsic --- rust-cuda-kernel/src/kernel/link/mod.rs | 15 ++------ .../generate/host_link_macro/get_ptx.rs | 7 +--- src/lib.rs | 5 ++- src/safety/ptx_kernel_signature.rs | 34 +++++++++++++++++++ 4 files changed, 42 
insertions(+), 19 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index a0df9366a..10d3b63ed 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -78,9 +78,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - } + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -118,9 +116,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident(host: &[u8]) -> rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - } + const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -203,12 +199,7 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout { - match host { - #byte_str => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match, - _ => rust_cuda_import::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Mismatch, - } - } + const #param: &[u8; #len] = #byte_str; }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git 
a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index a54a62b18..5504d12a8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,9 +71,7 @@ pub(super) fn quote_get_ptx( const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #ffi_signature_ident(&#crate_path::deps::const_type_layout::serialise_type_graph::< - #ffi_signature_ty - >()) + #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) }>; } }; @@ -89,9 +87,6 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { - #[allow(dead_code)] - use #crate_path as rust_cuda_import; - #args_trait extern "C" { #( diff --git a/src/lib.rs b/src/lib.rs index 1c92688b1..a6d41b648 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,9 +54,12 @@ #![feature(never_type)] #![feature(layout_for_ptr)] #![feature(cfg_version)] +#![cfg_attr(feature = "device", feature(slice_ptr_get))] #![allow(incomplete_features)] #![feature(generic_const_exprs)] -#![cfg_attr(feature = "device", feature(slice_ptr_get))] +#![allow(internal_features)] +#![feature(core_intrinsics)] +#![feature(const_intrinsic_compare_bytes)] #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")] #[cfg(all(feature = "host", feature = "device", not(doc)))] diff --git a/src/safety/ptx_kernel_signature.rs b/src/safety/ptx_kernel_signature.rs index 5b33567bb..a8b298691 100644 --- a/src/safety/ptx_kernel_signature.rs +++ b/src/safety/ptx_kernel_signature.rs @@ -1,3 +1,6 @@ +use const_type_layout::{serialise_type_graph, serialised_type_graph_len, TypeGraphLayout}; + +#[allow(clippy::module_name_repetitions)] #[derive(PartialEq, Eq, 
core::marker::ConstParamTy)] pub enum HostAndDeviceKernelSignatureTypeLayout { Match, @@ -5,3 +8,34 @@ pub enum HostAndDeviceKernelSignatureTypeLayout { } pub struct Assert; + +#[must_use] +pub const fn check( + device: &'static [u8], +) -> HostAndDeviceKernelSignatureTypeLayout +where + [u8; serialised_type_graph_len::()]:, +{ + const SIGNATURE_ERROR_MESSAGE: &[u8] = b"ERROR in this PTX compilation"; + + // Short-circuit to avoid extra errors when PTX compilation fails + if equals(device, SIGNATURE_ERROR_MESSAGE) { + return HostAndDeviceKernelSignatureTypeLayout::Match; + } + + let host = serialise_type_graph::(); + + if equals(device, &host) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } +} + +const fn equals(device: &[u8], host: &[u8]) -> bool { + if device.len() != host.len() { + return false; + } + + unsafe { core::intrinsics::compare_bytes(device.as_ptr(), host.as_ptr(), device.len()) == 0 } +} From 44a974bd2b0191193b635c4254995ed7b12ea9f5 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Thu, 1 Feb 2024 08:55:18 +0000 Subject: [PATCH 118/120] Try out experimental const-type-layout with compression --- Cargo.toml | 2 +- examples/derive/src/lib.rs | 1 - examples/single-source/src/main.rs | 1 - rust-cuda-kernel/src/kernel/link/config.rs | 2 ++ rust-cuda-kernel/src/kernel/link/mod.rs | 21 +++++++++++++++++++ .../src/kernel/specialise/entry_point.rs | 1 + .../src/kernel/specialise/function.rs | 1 + .../wrapper/generate/cuda_generic_function.rs | 1 + .../kernel/wrapper/generate/cuda_wrapper.rs | 1 + .../kernel/wrapper/generate/host_kernel_ty.rs | 1 + .../generate/host_link_macro/args_trait.rs | 1 + .../generate/host_link_macro/get_ptx.rs | 1 + .../wrapper/generate/host_link_macro/mod.rs | 1 + rust-cuda-kernel/src/kernel/wrapper/mod.rs | 1 + src/lib.rs | 1 - src/safety/ptx_entry_point.rs | 17 +++------------ 16 files changed, 36 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml 
index 3b6dbf342..acb60681a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ rustacuda_derive = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc regex = { version = "1.10", optional = true } -const-type-layout = { version = "0.2.1", features = ["derive"] } +const-type-layout = { git = "https://github.com/juntyr/const-type-layout", branch = "compress", features = ["derive"] } safer_owning_ref = { version = "0.5", optional = true } oneshot = { version = "0.1", optional = true, features = ["std", "async"] } diff --git a/examples/derive/src/lib.rs b/examples/derive/src/lib.rs index 622b1b699..6960eadeb 100644 --- a/examples/derive/src/lib.rs +++ b/examples/derive/src/lib.rs @@ -1,6 +1,5 @@ #![deny(clippy::pedantic)] #![feature(const_type_name)] -#![feature(offset_of)] #[derive(rc::lend::LendRustToCuda)] #[cuda(crate = "rc")] diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index b4a7cec52..3861190d2 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -5,7 +5,6 @@ #![cfg_attr(target_os = "cuda", feature(alloc_error_handler))] #![cfg_attr(target_os = "cuda", feature(asm_experimental_arch))] #![feature(const_type_name)] -#![feature(offset_of)] #![feature(cfg_version)] #![feature(type_alias_impl_trait)] #![feature(associated_type_bounds)] diff --git a/rust-cuda-kernel/src/kernel/link/config.rs b/rust-cuda-kernel/src/kernel/link/config.rs index 469318f02..02297ba7d 100644 --- a/rust-cuda-kernel/src/kernel/link/config.rs +++ b/rust-cuda-kernel/src/kernel/link/config.rs @@ -1,5 +1,7 @@ use std::{collections::HashMap, path::PathBuf}; +use quote::quote; + use crate::kernel::lints::{parse_ptx_lint_level, LintLevel, PtxLint}; #[allow(clippy::module_name_repetitions)] diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 10d3b63ed..001aad0ef 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ 
b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -18,6 +18,7 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; +use quote::quote; use crate::kernel::{ lints::{LintLevel, PtxLint}, @@ -196,6 +197,26 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec TokenStream { diff --git a/rust-cuda-kernel/src/kernel/specialise/function.rs b/rust-cuda-kernel/src/kernel/specialise/function.rs index 068f30d97..44d8b8a81 100644 --- a/rust-cuda-kernel/src/kernel/specialise/function.rs +++ b/rust-cuda-kernel/src/kernel/specialise/function.rs @@ -1,6 +1,7 @@ use std::env::VarError; use proc_macro::TokenStream; +use quote::quote; #[allow(clippy::module_name_repetitions)] pub fn specialise_kernel_function(attr: TokenStream, func: TokenStream) -> TokenStream { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs index 00e00d7d8..0799f4cc7 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_generic_function.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs index c3cb11458..ff7e2ee48 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/cuda_wrapper.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::{ wrapper::{FuncIdent, FunctionInputs, ImplGenerics}, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs index 78e972d69..757f22470 100644 --- 
a/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_kernel_ty.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs index 26653e435..1813942d8 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/args_trait.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{FunctionInputs, ImplGenerics}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 5504d12a8..4d5e01a25 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -1,5 +1,6 @@ use proc_macro2::TokenStream; use syn::spanned::Spanned; +use quote::quote; use crate::kernel::{ utils::skip_kernel_compilation, diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs index ea5daccdc..353e6c5dc 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/mod.rs @@ -1,4 +1,5 @@ use proc_macro2::TokenStream; +use quote::quote; use crate::kernel::wrapper::{DeclGenerics, FuncIdent, FunctionInputs, ImplGenerics, KernelConfig}; diff --git a/rust-cuda-kernel/src/kernel/wrapper/mod.rs b/rust-cuda-kernel/src/kernel/wrapper/mod.rs index 0c4f743ab..9dffacc51 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/mod.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/mod.rs @@ 
-19,6 +19,7 @@ use generate::{ use parse::parse_kernel_fn; use proc_macro2::{Ident, Span}; use syn::spanned::Spanned; +use quote::quote; #[allow(clippy::too_many_lines)] pub fn kernel(attr: TokenStream, func: TokenStream) -> TokenStream { diff --git a/src/lib.rs b/src/lib.rs index a6d41b648..35e11ed1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,6 @@ #![feature(doc_cfg)] #![feature(marker_trait_attr)] #![feature(const_type_name)] -#![feature(offset_of)] #![feature(adt_const_params)] #![feature(impl_trait_in_assoc_type)] #![feature(ptr_metadata)] diff --git a/src/safety/ptx_entry_point.rs b/src/safety/ptx_entry_point.rs index b1d62cf4e..ab06a13d9 100644 --- a/src/safety/ptx_entry_point.rs +++ b/src/safety/ptx_entry_point.rs @@ -55,19 +55,8 @@ const fn find(haystack: &[u8], needle: &[u8], from: usize) -> Option { } const fn starts_with(haystack: &[u8], needle: &[u8], from: usize) -> bool { - let mut i = 0; - - while i < needle.len() { - if (from + i) >= haystack.len() { - return false; - } - - if needle[i] == haystack[from + i] { - i += 1; - } else { - return false; - } - } + let haystack_len = haystack.len() - from; + let check_len = if needle.len() < haystack_len { needle.len() } else { haystack_len }; - true + unsafe { core::intrinsics::compare_bytes(haystack.as_ptr().add(from), needle.as_ptr(), check_len) == 0 } } From 3ec8118114eabbb1b3048af248d0439e4d250a37 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 5 Feb 2024 06:55:09 +0000 Subject: [PATCH 119/120] Try check --- rust-cuda-kernel/src/kernel/link/mod.rs | 22 +++++++++++++++---- .../generate/host_link_macro/get_ptx.rs | 11 +++++++--- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 001aad0ef..0b68debc7 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -79,7 +79,11 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { 
proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + HostAndDeviceKernelSignatureTypeLayout::Match + } + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }); @@ -117,7 +121,11 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { ) else { return (quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const #ffi_signature_ident: &[u8; 29] = b"ERROR in this PTX compilation"; + + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + HostAndDeviceKernelSignatureTypeLayout::Match + } + ::core::compile_error!("rust-cuda PTX kernel compilation failed"); }) .into(); @@ -217,10 +225,16 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec() -> HostAndDeviceKernelSignatureTypeLayout { + if check_serialised_type_graph::(#byte_str) { + HostAndDeviceKernelSignatureTypeLayout::Match + } else { + HostAndDeviceKernelSignatureTypeLayout::Mismatch + } + } }); let type_layout_end = bytes_start + bytes_end_offset + AFTER_BYTES_PATTERN.len(); diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 4d5e01a25..08ec8ab40 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,9 +71,14 @@ pub(super) fn quote_get_ptx( quote::quote_spanned! 
{ func_ident.span()=> const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ - #crate_path::safety::ptx_kernel_signature::check::<#ffi_signature_ty>(#ffi_signature_ident) - }>; + }> = { + use #crate_path::deps::const_type_layout::{TypeLayoutGraph, check_serialised_type_graph}; + use crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + + #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }> + }; } }; From 6311a6d40e91e817f0474447f5312129d2ca9581 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Mon, 5 Feb 2024 09:13:16 +0000 Subject: [PATCH 120/120] Try check again --- rust-cuda-kernel/src/kernel/link/mod.rs | 6 +++--- .../wrapper/generate/host_link_macro/get_ptx.rs | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs index 0b68debc7..bbe243c9f 100644 --- a/rust-cuda-kernel/src/kernel/link/mod.rs +++ b/rust-cuda-kernel/src/kernel/link/mod.rs @@ -80,7 +80,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { proc_macro_error::set_dummy(quote! { const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { HostAndDeviceKernelSignatureTypeLayout::Match } @@ -122,7 +122,7 @@ pub fn compile_kernel(tokens: TokenStream) -> TokenStream { return (quote! 
{ const #ptx_cstr_ident: &'static ::core::ffi::CStr = c"ERROR in this PTX compilation"; - const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #ffi_signature_ident() -> HostAndDeviceKernelSignatureTypeLayout { HostAndDeviceKernelSignatureTypeLayout::Match } @@ -228,7 +228,7 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> Vec() -> HostAndDeviceKernelSignatureTypeLayout { + const fn #param() -> HostAndDeviceKernelSignatureTypeLayout { if check_serialised_type_graph::(#byte_str) { HostAndDeviceKernelSignatureTypeLayout::Match } else { diff --git a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs index 08ec8ab40..ef65d5596 100644 --- a/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs +++ b/rust-cuda-kernel/src/kernel/wrapper/generate/host_link_macro/get_ptx.rs @@ -71,14 +71,9 @@ pub(super) fn quote_get_ptx( quote::quote_spanned! { func_ident.span()=> const _: #crate_path::safety::ptx_kernel_signature::Assert<{ #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout::Match - }> = { - use #crate_path::deps::const_type_layout::{TypeLayoutGraph, check_serialised_type_graph}; - use crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; - - #crate_path::safety::ptx_kernel_signature::Assert::<{ - #ffi_signature_ident::<#ffi_signature_ty>() - }> - }; + }> = #crate_path::safety::ptx_kernel_signature::Assert::<{ + #ffi_signature_ident::<#ffi_signature_ty>() + }>; } }; @@ -93,6 +88,10 @@ pub(super) fn quote_get_ptx( quote! { fn get_ptx() -> &'static ::core::ffi::CStr { + // FIXME: don't use imports here + use #crate_path::deps::const_type_layout::{TypeGraphLayout, check_serialised_type_graph}; + use #crate_path::safety::ptx_kernel_signature::HostAndDeviceKernelSignatureTypeLayout; + #args_trait extern "C" { #(