10 changes: 5 additions & 5 deletions src/abstract_gpu_interface.jl
@@ -106,8 +106,8 @@ end
# CUDAnative.__syncthreads()
# end



abstract type GPUBackend end
backend(::Type{T}) where T = error("Can't choose GPU backend for $T")

"""
gpu_call(kernel::Function, A::GPUArray, args::Tuple, configuration = length(A))
@@ -124,7 +124,7 @@ Optionally, a launch configuration can be supplied in the following way:
2) Pass a tuple of integer tuples to define blocks and threads per blocks!

"""
function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(A))
ITuple = NTuple{N, Integer} where N
# If it is a single integer, we assume it to be the global size / total number of threads one wants to launch
thread_blocks = if isa(configuration, Integer)
@@ -148,8 +148,8 @@ function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
`linear_index` will be in between 1:prod((blocks..., threads...))
""")
end
_gpu_call(kernel, A, args, thread_blocks)
_gpu_call(backend(typeof(A)), kernel, A, args, thread_blocks)
end

# Internal GPU call function, that needs to be overloaded by the backends.
_gpu_call(f, A, args, thread_blocks) = error("Not implemented")
_gpu_call(::Any, f, A, args, thread_blocks) = error("Not implemented")
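With the backend trait in place, a downstream package only has to define a singleton backend type, point `backend` at it for its array type, and overload the backend-specific `_gpu_call`. A minimal sketch of that hook-up, assuming a hypothetical `MyBackend`/`MyArray` pair (the in-tree `JLBackend` in src/array.jl below does exactly this):

struct MyBackend <: GPUBackend end
backend(::Type{<:MyArray}) = MyBackend()   # MyArray is a hypothetical GPUArray subtype

function _gpu_call(::MyBackend, f, A, args::Tuple, blocks_threads)
    blocks, threads = blocks_threads
    # launch `f` on the device with this blocks/threads configuration
end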
30 changes: 14 additions & 16 deletions src/abstractarray.jl
@@ -30,10 +30,6 @@ function deserialize(s::AbstractSerializer, ::Type{T}) where T <: GPUArray
T(A)
end

@inline unpack_buffer(x) = x
@inline unpack_buffer(x::GPUArray) = pointer(x)
@inline unpack_buffer(x::Ref{<: GPUArray}) = unpack_buffer(x[])

function to_cartesian(A, indices::Tuple)
start = CartesianIndex(ntuple(length(indices)) do i
val = indices[i]
@@ -56,22 +52,24 @@ end

## showing

for (atype, op) in
[(:(GPUArray), :(Array)),
(:(LinearAlgebra.Adjoint{<:Any,<:GPUArray}), :(x->LinearAlgebra.adjoint(Array(parent(x))))),
(:(LinearAlgebra.Transpose{<:Any,<:GPUArray}), :(x->LinearAlgebra.transpose(Array(parent(x)))))]
for (AT, f) in
(GPUArray => Array,
LinearAlgebra.Adjoint{<:Any,<:GPUArray} => x->LinearAlgebra.adjoint(Array(parent(x))),
LinearAlgebra.Transpose{<:Any,<:GPUArray} => x->LinearAlgebra.transpose(Array(parent(x))),
SubArray{<:Any,<:Any,<:GPUArray} => x->SubArray(Array(parent(x)), parentindices(x))
)
@eval begin
# for display
Base.print_array(io::IO, X::($atype)) =
Base.print_array(io,($op)(X))
Base.print_array(io::IO, X::$AT) =
Base.print_array(io,$f(X))

# for show
Base._show_nonempty(io::IO, X::($atype), prefix::String) =
Base._show_nonempty(io,($op)(X),prefix)
Base._show_empty(io::IO, X::($atype)) =
Base._show_empty(io,($op)(X))
Base.show_vector(io::IO, v::($atype), args...) =
Base.show_vector(io,($op)(v),args...)
Base._show_nonempty(io::IO, X::$AT, prefix::String) =
Base._show_nonempty(io,$f(X),prefix)
Base._show_empty(io::IO, X::$AT) =
Base._show_empty(io,$f(X))
Base.show_vector(io::IO, v::$AT, args...) =
Base.show_vector(io,$f(v),args...)
end
end

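For reference, the loop above generates one set of display methods per wrapper type; for the newly added SubArray entry the `print_array` definition expands to roughly:

Base.print_array(io::IO, X::SubArray{<:Any,<:Any,<:GPUArray}) =
    Base.print_array(io, SubArray(Array(parent(X)), parentindices(X)))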
4 changes: 3 additions & 1 deletion src/array.jl
@@ -21,6 +21,8 @@ function JLArray{T, N}(size::NTuple{N, Integer}) where {T, N}
JLArray{T, N}(Array{T, N}(undef, size), size)
end

struct JLBackend <: GPUBackend end
backend(::Type{<:JLArray}) = JLBackend()

## getters

@@ -120,7 +122,7 @@ function AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
reshape(ptr, shape)
end

function _gpu_call(f, A::JLArray, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
blocks, threads = blocks_threads
idx = ntuple(i-> 1, length(blocks))
blockdim = blocks
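With this, the documented `gpu_call` API on a `JLArray` now routes through `backend(typeof(A)) == JLBackend()` into the `_gpu_call` method above. A small usage sketch (not part of the diff, using only constructors and helpers shown elsewhere in it):

A = JLArray{Float32, 1}((16,))          # uninitialized 16-element array
gpu_call(A, (A,)) do state, a
    i = @linearidx(a, state)            # linear index of the current thread
    a[i] = Float32(i)
    return
end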
55 changes: 52 additions & 3 deletions src/broadcast.jl
@@ -2,13 +2,56 @@ using Base.Broadcast

import Base.Broadcast: BroadcastStyle, Broadcasted, ArrayStyle

BroadcastStyle(::Type{T}) where T <: GPUArray = ArrayStyle{T}()
# we define a generic `BroadcastStyle` here that should be sufficient for most cases.
# dependent packages like `CuArrays` can define their own `BroadcastStyle` allowing
# them to further change or optimize broadcasting.
#
# TODO: investigate if we should define our own `GPUArrayStyle{N} <: AbstractArrayStyle{N}`
#
# NOTE: this uses the specific `T` that was used, e.g. `JLArray` or `CLArray`, for the ArrayStyle,
# instead of using `ArrayStyle{GPUArray}`, because of how `similar` works.
BroadcastStyle(::Type{T}) where {T<:GPUArray} = ArrayStyle{T}()

function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where {GPU <: GPUArray, ElType}
# These wrapper types otherwise forget that they are GPU compatible
#
# NOTE: Don't directly use ArrayStyle{GPUArray} here, since that would mean that `CuArrays`
# customizations would no longer take effect.
BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)

backend(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = backend(T)
backend(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = backend(T)
backend(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = backend(T)

# This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray
# and we could define our methods in terms of Union{GPUArray, WrappedArray{<:Any, <:GPUArray}}
const GPUDestArray = Union{GPUArray,
LinearAlgebra.Transpose{<:Any,<:GPUArray},
LinearAlgebra.Adjoint{<:Any,<:GPUArray},
SubArray{<:Any,<:Any,<:GPUArray}}

# This method is responsible for selecting the output type of broadcast
function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where
{GPU <: GPUArray, ElType}
similar(GPU, ElType, axes(bc))
end

@inline function Base.copyto!(dest::GPUArray, bc::Broadcasted{Nothing})
# We purposefully only specialize `copyto!`; dependent packages need to make sure that they
# can handle:
# - `bc::Broadcast.Broadcasted{Style}`
# - `ex::Broadcast.Extruded`
# - `LinearAlgebra.Transpose{<:Any,<:GPUArray}` and `LinearAlgebra.Adjoint{<:Any,<:GPUArray}`, etc.
# as arguments to a kernel and that they do the right conversion.
#
# This broadcast can be further customized by:
# - `Broadcast.preprocess(dest::GPUArray, bc::Broadcasted{Nothing})`, which allows for a
# complete transformation based on the output type just at the end of the pipeline.
# - `Broadcast.broadcasted(::Style, f)`, which selects an implementation of `f` compatible
# with `Style`
#
# For more information see the Base documentation.
@inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
bc′ = Broadcast.preprocess(dest, bc)
gpu_call(dest, (dest, bc′)) do state, dest, bc′
@@ -20,6 +63,12 @@ end
return dest
end

# Base defines this method as a performance optimization, but we don't know how to do
# `fill!` in general for all `GPUDestArray` so we just go straight to the fallback
@inline Base.copyto!(dest::GPUDestArray, bc::Broadcasted{<:Broadcast.AbstractArrayStyle{0}}) =
copyto!(dest, convert(Broadcasted{Nothing}, bc))

# TODO: is this still necessary?
function mapidx(f, A::GPUArray, args::NTuple{N, Any}) where N
gpu_call(A, (f, A, args)) do state, f, A, args
ilin = @linearidx(A, state)
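Taken together, the wrapper `BroadcastStyle`/`backend` methods and the `GPUDestArray` union let broadcasts whose destination is a wrapped GPU array still reach the GPU `copyto!` above instead of falling back to the generic AbstractArray path. A sketch of what this enables, with `AT` standing for any GPUArray subtype as in the testsuite:

A = AT(rand(Float32, 4))
A' .= 2f0                    # Adjoint destination dispatches to copyto!(::GPUDestArray, ...)
view(A, 2:3) .= 0f0          # SubArray destinations take the same path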
5 changes: 4 additions & 1 deletion src/mapreduce.jl
@@ -5,7 +5,10 @@

Base.any(A::GPUArray{Bool}) = mapreduce(identity, |, A; init = false)
Base.all(A::GPUArray{Bool}) = mapreduce(identity, &, A; init = true)
Base.count(pred, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))

Base.any(f::Function, A::GPUArray) = mapreduce(f, |, A; init = false)
Base.all(f::Function, A::GPUArray) = mapreduce(f, &, A; init = true)
Base.count(pred::Function, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))

Base.:(==)(A::GPUArray, B::GPUArray) = Bool(mapreduce(==, &, A, B; init = true))

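The new predicate forms mirror the Base signatures and simply lower to `mapreduce`. A quick usage sketch, again with `AT` standing for a GPUArray subtype:

A = AT(rand(Float32, 10))
any(x -> x > 0.5f0, A)       # mapreduce(f, |, A; init = false)
all(x -> x >= 0f0, A)        # mapreduce(f, &, A; init = true)
count(x -> x > 0.5f0, A)     # Int(mapreduce(pred, +, A; init = 0))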
8 changes: 8 additions & 0 deletions src/testsuite/broadcasting.jl
@@ -53,6 +53,14 @@ function broadcasting(AT)
end
end

@testset "Adjoint and Transpose" begin
A = AT(rand(ET, N))
A' .= ET(2)
@test all(x->x==ET(2), A)
transpose(A) .= ET(1)
@test all(x->x==ET(1), A)
end

############
# issue #27
@test compare((a, b)-> a .+ b, AT, rand(ET, 4, 5, 3), rand(ET, 1, 5, 3))