diff --git a/src/abstract_gpu_interface.jl b/src/abstract_gpu_interface.jl
index 511eff44a..362083c62 100644
--- a/src/abstract_gpu_interface.jl
+++ b/src/abstract_gpu_interface.jl
@@ -106,8 +106,8 @@ end
 #         CUDAnative.__syncthreads()
 #     end
-
-
+abstract type GPUBackend end
+backend(::Type{T}) where T = error("Can't choose GPU backend for $T")
 """
     gpu_call(kernel::Function, A::GPUArray, args::Tuple, configuration = length(A))
 
@@ -124,7 +124,7 @@ Optionally, a launch configuration can be supplied in the following way:
     2) Pass a tuple of integer tuples to define blocks and threads per blocks!
 
 """
-function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
+function gpu_call(kernel, A::AbstractArray, args::Tuple, configuration = length(A))
     ITuple = NTuple{N, Integer} where N
     # If is a single integer, we assume it to be the global size / total number of threads one wants to launch
     thread_blocks = if isa(configuration, Integer)
@@ -148,8 +148,8 @@ function gpu_call(kernel, A::GPUArray, args::Tuple, configuration = length(A))
         `linear_index` will be inbetween 1:prod((blocks..., threads...))
         """)
     end
-    _gpu_call(kernel, A, args, thread_blocks)
+    _gpu_call(backend(typeof(A)), kernel, A, args, thread_blocks)
 end
 
 # Internal GPU call function, that needs to be overloaded by the backends.
-_gpu_call(f, A, args, thread_blocks) = error("Not implemented")
+_gpu_call(::Any, f, A, args, thread_blocks) = error("Not implemented")
diff --git a/src/abstractarray.jl b/src/abstractarray.jl
index 7148924ad..23fe047be 100644
--- a/src/abstractarray.jl
+++ b/src/abstractarray.jl
@@ -30,10 +30,6 @@ function deserialize(s::AbstractSerializer, ::Type{T}) where T <: GPUArray
     T(A)
 end
 
-@inline unpack_buffer(x) = x
-@inline unpack_buffer(x::GPUArray) = pointer(x)
-@inline unpack_buffer(x::Ref{<: GPUArray}) = unpack_buffer(x[])
-
 function to_cartesian(A, indices::Tuple)
     start = CartesianIndex(ntuple(length(indices)) do i
         val = indices[i]
@@ -56,22 +52,24 @@ end
 
 ## showing
 
-for (atype, op) in
-    [(:(GPUArray), :(Array)),
-     (:(LinearAlgebra.Adjoint{<:Any,<:GPUArray}), :(x->LinearAlgebra.adjoint(Array(parent(x))))),
-     (:(LinearAlgebra.Transpose{<:Any,<:GPUArray}), :(x->LinearAlgebra.transpose(Array(parent(x)))))]
+for (AT, f) in
+    (GPUArray => Array,
+     LinearAlgebra.Adjoint{<:Any,<:GPUArray} => x->LinearAlgebra.adjoint(Array(parent(x))),
+     LinearAlgebra.Transpose{<:Any,<:GPUArray} => x->LinearAlgebra.transpose(Array(parent(x))),
+     SubArray{<:Any,<:Any,<:GPUArray} => x->SubArray(Array(parent(x)), parentindices(x))
+    )
     @eval begin
         # for display
-        Base.print_array(io::IO, X::($atype)) =
-            Base.print_array(io,($op)(X))
+        Base.print_array(io::IO, X::$AT) =
+            Base.print_array(io,$f(X))
         # for show
-        Base._show_nonempty(io::IO, X::($atype), prefix::String) =
-            Base._show_nonempty(io,($op)(X),prefix)
-        Base._show_empty(io::IO, X::($atype)) =
-            Base._show_empty(io,($op)(X))
-        Base.show_vector(io::IO, v::($atype), args...) =
-            Base.show_vector(io,($op)(v),args...)
+        Base._show_nonempty(io::IO, X::$AT, prefix::String) =
+            Base._show_nonempty(io,$f(X),prefix)
+        Base._show_empty(io::IO, X::$AT) =
+            Base._show_empty(io,$f(X))
+        Base.show_vector(io::IO, v::$AT, args...) =
+            Base.show_vector(io,$f(v),args...)
     end
 end
diff --git a/src/array.jl b/src/array.jl
index b668b2d15..b461250c7 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -21,6 +21,8 @@ function JLArray{T, N}(size::NTuple{N, Integer}) where {T, N}
     JLArray{T, N}(Array{T, N}(undef, size), size)
 end
 
+struct JLBackend <: GPUBackend end
+backend(::Type{<:JLArray}) = JLBackend()
 
 ## getters
 
@@ -120,7 +122,7 @@ function AbstractDeviceArray(ptr::Array, shape::Vararg{Integer, N}) where N
     reshape(ptr, shape)
 end
 
-function _gpu_call(f, A::JLArray, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
+function _gpu_call(::JLBackend, f, A, args::Tuple, blocks_threads::Tuple{T, T}) where T <: NTuple{N, Integer} where N
     blocks, threads = blocks_threads
     idx = ntuple(i-> 1, length(blocks))
     blockdim = blocks
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 4bcc3ab30..67c9f6f2a 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -2,13 +2,56 @@ using Base.Broadcast
 
 import Base.Broadcast: BroadcastStyle, Broadcasted, ArrayStyle
 
-BroadcastStyle(::Type{T}) where T <: GPUArray = ArrayStyle{T}()
+# We define a generic `BroadcastStyle` here that should be sufficient for most cases.
+# Dependent packages like `CuArrays` can define their own `BroadcastStyle`, allowing
+# them to further change or optimize broadcasting.
+#
+# TODO: investigate if we should define our own `GPUArrayStyle{N} <: AbstractArrayStyle{N}`
+#
+# NOTE: this uses the specific `T` that was used (e.g. `JLArray` or `CLArray`) for the `ArrayStyle`,
+# instead of `ArrayStyle{GPUArray}`, because of how `similar` works.
+BroadcastStyle(::Type{T}) where {T<:GPUArray} = ArrayStyle{T}()
 
-function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where {GPU <: GPUArray, ElType}
+# These wrapper types otherwise forget that they are GPU compatible.
+#
+# NOTE: Don't directly use `ArrayStyle{GPUArray}` here, since that would mean that `CuArrays`
+# customizations no longer take effect.
+BroadcastStyle(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+BroadcastStyle(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+BroadcastStyle(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = BroadcastStyle(T)
+
+backend(::Type{<:LinearAlgebra.Transpose{<:Any,T}}) where {T<:GPUArray} = backend(T)
+backend(::Type{<:LinearAlgebra.Adjoint{<:Any,T}}) where {T<:GPUArray} = backend(T)
+backend(::Type{<:SubArray{<:Any,<:Any,T}}) where {T<:GPUArray} = backend(T)
+
+# This Union is a hack. Ideally Base would have a Transpose <: WrappedArray <: AbstractArray,
+# and we could define our methods in terms of Union{GPUArray, WrappedArray{<:Any, <:GPUArray}}.
+const GPUDestArray = Union{GPUArray,
+                           LinearAlgebra.Transpose{<:Any,<:GPUArray},
+                           LinearAlgebra.Adjoint{<:Any,<:GPUArray},
+                           SubArray{<:Any,<:Any,<:GPUArray}}
+
+# This method is responsible for selecting the output type of broadcast.
+function Base.similar(bc::Broadcasted{<:ArrayStyle{GPU}}, ::Type{ElType}) where
+    {GPU <: GPUArray, ElType}
     similar(GPU, ElType, axes(bc))
 end
 
-@inline function Base.copyto!(dest::GPUArray, bc::Broadcasted{Nothing})
+# We purposefully only specialize `copyto!`; dependent packages need to make sure that they
+# can handle:
+# - `bc::Broadcast.Broadcasted{Style}`
+# - `ex::Broadcast.Extruded`
+# - `LinearAlgebra.Transpose{<:Any,<:GPUArray}` and `LinearAlgebra.Adjoint{<:Any,<:GPUArray}`, etc.
+# as arguments to a kernel, and that they do the right conversion.
+#
+# This broadcast implementation can be further customized by:
+# - `Broadcast.preprocess(dest::GPUArray, bc::Broadcasted{Nothing})`, which allows a
+#   complete transformation based on the output type right at the end of the pipeline.
+# - `Broadcast.broadcasted(::Style, f)`, which allows selecting an implementation of `f`
+#   compatible with `Style`.
+#
+# For more information see the Base documentation.
+@inline function Base.copyto!(dest::GPUDestArray, bc::Broadcasted{Nothing})
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     bc′ = Broadcast.preprocess(dest, bc)
     gpu_call(dest, (dest, bc′)) do state, dest, bc′
@@ -20,6 +63,12 @@ end
     return dest
 end
 
+# Base defines this method as a performance optimization, but we don't know how to do
+# `fill!` in general for every `GPUDestArray`, so we just go straight to the fallback.
+@inline Base.copyto!(dest::GPUDestArray, bc::Broadcasted{<:Broadcast.AbstractArrayStyle{0}}) =
+    copyto!(dest, convert(Broadcasted{Nothing}, bc))
+
+# TODO: is this still necessary?
 function mapidx(f, A::GPUArray, args::NTuple{N, Any}) where N
     gpu_call(A, (f, A, args)) do state, f, A, args
         ilin = @linearidx(A, state)
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 2f1c7ac5e..b5034c0f4 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -5,7 +5,10 @@ Base.any(A::GPUArray{Bool}) = mapreduce(identity, |, A; init = false)
 Base.all(A::GPUArray{Bool}) = mapreduce(identity, &, A; init = true)
-Base.count(pred, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))
+
+Base.any(f::Function, A::GPUArray) = mapreduce(f, |, A; init = false)
+Base.all(f::Function, A::GPUArray) = mapreduce(f, &, A; init = true)
+Base.count(pred::Function, A::GPUArray) = Int(mapreduce(pred, +, A; init = 0))
 
 Base.:(==)(A::GPUArray, B::GPUArray) = Bool(mapreduce(==, &, A, B; init = true))
diff --git a/src/testsuite/broadcasting.jl b/src/testsuite/broadcasting.jl
index 6340a6729..39e5477e2 100644
--- a/src/testsuite/broadcasting.jl
+++ b/src/testsuite/broadcasting.jl
@@ -53,6 +53,14 @@ function broadcasting(AT)
         end
     end
 
+    @testset "Adjoint and Transpose" begin
+        A = AT(rand(ET, N))
+        A' .= ET(2)
+        @test all(x->x==ET(2), A)
+        transpose(A) .= ET(1)
+        @test all(x->x==ET(1), A)
+    end
+
     ############
     # issue #27
     @test compare((a, b)-> a .+ b, AT, rand(ET, 4, 5, 3), rand(ET, 1, 5, 3))
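Usage note: a minimal sketch of the behavior this patch enables, modeled directly on the "Adjoint and Transpose" testset above. It is not part of the patch, and it assumes the reference CPU backend is reachable as `GPUArrays.JLArray`; any backend array type that defines `backend` and `_gpu_call` behaves the same way.

    using GPUArrays, LinearAlgebra
    using GPUArrays: JLArray          # assumption: the reference backend from src/array.jl is reachable here

    A = JLArray(rand(Float32, 4))

    # Adjoint/Transpose/SubArray wrappers now keep the broadcast style and backend of their
    # parent, so these scalar broadcasts reach `gpu_call` through the new wrapper `backend`
    # forwarding methods and the `GPUDestArray` `copyto!` specializations, instead of erroring.
    A' .= 2f0
    @assert all(x -> x == 2f0, A)     # uses the new `Base.all(f::Function, A::GPUArray)`

    transpose(A) .= 1f0
    @assert all(x -> x == 1f0, A)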