diff --git a/src/abstract_gpu_interface.jl b/src/abstract_gpu_interface.jl
index 511eff44a..362083c62 100644
--- a/src/abstract_gpu_interface.jl
+++ b/src/abstract_gpu_interface.jl
@@ -106,8 +106,8 @@ end
 #         CUDAnative.__syncthreads()
 #     end
-
-
+abstract type GPUBackend end
+backend(::Type{T}) where T = error("Can't choose GPU backend for $T")
 """
     gpu_call(kernel::Function, A::GPUArray, args::Tuple, configuration = length(A))
 
@@ -124,7 +124,7 @@ Optionally, a launch configuration can be supplied in the following way:
     2) Pass a tuple of integer tuples to define blocks and threads per blocks!
 
 """
-function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
+function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(A))
     ITuple = NTuple{N, Integer} where N
     # If is a single integer, we assume it to be the global size / total number of threads one wants to launch
     thread_blocks = if isa(configuration, Integer)
@@ -148,8 +148,8 @@ function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
         `linear_index` will be inbetween 1:prod((blocks..., threads...))
         """)
     end
-    _gpu_call(kernel, A, args, thread_blocks)
+    _gpu_call(backend(typeof(A)), kernel, A, args, thread_blocks)
 end
 
 # Internal GPU call function, that needs to be overloaded by the backends.
-_gpu_call(f, A, args, thread_blocks) = error("Not implemented")
+_gpu_call(::Any, f, A, args, thread_blocks) = error("Not implemented")
diff --git a/src/abstractarray.jl b/src/abstractarray.jl
index 7148924ad..23fe047be 100644
--- a/src/abstractarray.jl
+++ b/src/abstractarray.jl
@@ -30,10 +30,6 @@ function deserialize(s::AbstractSerializer, ::Type{T}) where T <: GPUArray
     T(A)
 end
 
-@inline unpack_buffer(x) = x
-@inline unpack_buffer(x::GPUArray) = pointer(x)
-@inline unpack_buffer(x::Ref{<: GPUArray}) = unpack_buffer(x[])
-
 function to_cartesian(A, indices::Tuple)
     start = CartesianIndex(ntuple(length(indices)) do i
         val = indices[i]
@@ -56,22 +52,24 @@ end
 
 ## showing
 
-for (atype, op) in
-    [(:(GPUArray), :(Array)),
-     (:(LinearAlgebra.Adjoint{<:Any,<:GPUArray}), :(x->LinearAlgebra.adjoint(Array(parent(x))))),
-     (:(LinearAlgebra.Transpose{<:Any,<:GPUArray}), :(x->LinearAlgebra.transpose(Array(parent(x)))))]
+for (AT, f) in
+    (GPUArray => Array,
+     LinearAlgebra.Adjoint{<:Any,<:GPUArray} => x->LinearAlgebra.adjoint(Array(parent(x))),
+     LinearAlgebra.Transpose{<:Any,<:GPUArray} => x->LinearAlgebra.transpose(Array(parent(x))),
+     SubArray{<:Any,<:Any,<:GPUArray} => x->SubArray(Array(parent(x)), parentindices(x))
+    )
     @eval begin
         # for display
-        Base.print_array(io::IO, X::($atype)) =
-            Base.print_array(io,($op)(X))
+        Base.print_array(io::IO, X::$AT) =
+            Base.print_array(io,$f(X))
         # for show
-        Base._show_nonempty(io::IO, X::($atype), prefix::String) =
-            Base._show_nonempty(io,($op)(X),prefix)
-        Base._show_empty(io::IO, X::($atype)) =
-            Base._show_empty(io,($op)(X))
-        Base.show_vector(io::IO, v::($atype), args...) =
-            Base.show_vector(io,($op)(v),args...)
+        Base._show_nonempty(io::IO, X::$AT, prefix::String) =
+            Base._show_nonempty(io,$f(X),prefix)
+        Base._show_empty(io::IO, X::$AT) =
+            Base._show_empty(io,$f(X))
+        Base.show_vector(io::IO, v::$AT, args...) =
+            Base.show_vector(io,$f(v),args...)
     end
 end
diff --git a/src/array.jl b/src/array.jl
index b668b2d15..b461250c7 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -21,6 +21,8 @@ function JLArray{T, N}(size::NTuple{N, Integer}) where {T, N}
     JLArray{T, N}(Array{T, N}(undef, size), size)
 end
 
+struct JLBackend <: GPUBackend end
+backend(::Type{<:JLArray}) = JLBackend()
 
 ## getters
 
@@ -120,7 +122,7 @@ function AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
     reshape(ptr, shape)
 end
 
-function _gpu_call(f, A::JLArray, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
     blocks, threads = blocks_threads
     idx = ntuple(i-> 1, length(blocks))
     blockdim = blocks
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 4bcc3ab30..67c9f6f2a 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -2,13 +2,56 @@ using Base.Broadcast
 
 import Base.Broadcast: BroadcastStyle, Broadcasted, ArrayStyle
 
-BroadcastStyle(::Type{T}) where T <: GPUArray = ArrayStyle{T}()
+# We define a generic `BroadcastStyle` here that should be sufficient for most cases.
+# Dependent packages like `CuArrays` can define their own `BroadcastStyle`, allowing
+# them to further change or optimize broadcasting.
+#
+# TODO: investigate if we should define our own `GPUArrayStyle{N} <: AbstractArrayStyle{N}`
+#
+# NOTE: this uses the specific `T` that was used (e.g. `JLArray` or `CLArray`) for the `ArrayStyle`,
+# instead of `ArrayStyle{GPUArray}`, because of how `similar` works.
+BroadcastStyle(::Type{T}) where {T<:GPUArray} = ArrayStyle{T}()
 
-function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where {GPU <: GPUArray, ElType}
+# These wrapper types otherwise forget that they are GPU compatible.
+#
+# NOTE: Don't directly use `ArrayStyle{GPUArray}` here, since that would mean that `CuArrays`
+# customizations no longer take effect.
+BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+
+backend(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = backend(T)
+backend(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = backend(T)
+backend(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = backend(T)
+
+# This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray,
+# and we could define our methods in terms of Union{GPUArray, WrappedArray{<:Any, <:GPUArray}}.
+const GPUDestArray = Union{GPUArray,
+                           LinearAlgebra.Transpose{<:Any,<:GPUArray},
+                           LinearAlgebra.Adjoint{<:Any,<:GPUArray},
+                           SubArray{<:Any,<:Any,<:GPUArray}}
+
+# This method is responsible for selecting the output type of broadcast.
+function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where
+    {GPU <: GPUArray, ElType}
     similar(GPU, ElType, axes(bc))
 end
 
-@inline function Base.copyto!(dest::GPUArray, bc::Broadcasted{Nothing})
+# We purposefully only specialize `copyto!`; dependent packages need to make sure that they
+# can handle:
+# - `bc::Broadcast.Broadcasted{Style}`
+# - `ex::Broadcast.Extruded`
+# - `LinearAlgebra.Transpose{<:Any,<:GPUArray}` and `LinearAlgebra.Adjoint{<:Any,<:GPUArray}`, etc.
+# as arguments to a kernel, and that they do the right conversion.
+#
+# This broadcast implementation can be further customized by:
+# - `Broadcast.preprocess(dest::GPUArray, bc::Broadcasted{Nothing})`, which allows a
+#   complete transformation based on the output type right at the end of the pipeline.
+# - `Broadcast.broadcasted(::Style, f)`, which allows selecting an implementation of `f`
+#   compatible with `Style`.
+#
+# For more information see the Base documentation.
+@inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     bc′ = Broadcast.preprocess(dest, bc)
     gpu_call(dest, (dest, bc′)) do state, dest, bc′
@@ -20,6 +63,12 @@ end
     return dest
 end
 
+# Base defines this method as a performance optimization, but we don't know how to do
+# `fill!` in general for every `GPUDestArray`, so we just go straight to the fallback.
+@inline Base.copyto!(dest::GPUDestArray, bc::Broadcasted{<:Broadcast.AbstractArrayStyle{0}}) =
+    copyto!(dest, convert(Broadcasted{Nothing}, bc))
+
+# TODO: is this still necessary?
 function mapidx(f, A::GPUArray, args::NTuple{N, Any}) where N
     gpu_call(A, (f, A, args)) do state, f, A, args
         ilin = @linearidx(A, state)
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 2f1c7ac5e..b5034c0f4 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -5,7 +5,10 @@ Base.any(A::GPUArray{Bool}) = mapreduce(identity, |, A; init = false)
 Base.all(A::GPUArray{Bool}) = mapreduce(identity, &, A; init = true)
-Base.count(pred, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))
+
+Base.any(f::Function, A::GPUArray) = mapreduce(f, |, A; init = false)
+Base.all(f::Function, A::GPUArray) = mapreduce(f, &, A; init = true)
+Base.count(pred::Function, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))
 
 Base.:(==)(A::GPUArray, B::GPUArray) = Bool(mapreduce(==, &, A, B; init = true))
diff --git a/src/testsuite/broadcasting.jl b/src/testsuite/broadcasting.jl
index 6340a6729..39e5477e2 100644
--- a/src/testsuite/broadcasting.jl
+++ b/src/testsuite/broadcasting.jl
@@ -53,6 +53,14 @@ function broadcasting(AT)
         end
     end
 
+    @testset "Adjoint and Transpose" begin
+        A = AT(rand(ET, N))
+        A' .= ET(2)
+        @test all(x->x==ET(2), A)
+        transpose(A) .= ET(1)
+        @test all(x->x==ET(1), A)
+    end
+
     ############
     # issue #27
     @test compare((a, b)-> a .+ b, AT, rand(ET, 4, 5, 3), rand(ET, 1, 5, 3))
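Usage note: a minimal sketch of the behavior this patch enables, modeled directly on the "Adjoint and Transpose" testset above. It is not part of the patch, and it assumes the reference CPU backend is reachable as `GPUArrays.JLArray`; any backend array type that defines `backend` and `_gpu_call` behaves the same way.

    using GPUArrays, LinearAlgebra
    using GPUArrays: JLArray          # assumption: the reference backend from src/array.jl is reachable here

    A = JLArray(rand(Float32, 4))

    # Adjoint/Transpose/SubArray wrappers now keep the broadcast style and backend of their
    # parent, so these scalar broadcasts reach `gpu_call` through the new wrapper `backend`
    # forwarding methods and the `GPUDestArray` `copyto!` specializations, instead of erroring.
    A' .= 2f0
    @assert all(x -> x == 2f0, A)     # uses the new `Base.all(f::Function, A::GPUArray)`

    transpose(A) .= 1f0
    @assert all(x -> x == 1f0, A)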