# This file is a part of Julia. License is MIT: https://julialang.org/license
"""
    _is_effect_free(f, t::Type{<:Tuple}) -> ans::Bool

Return `true` if `f(args...)` with `args::t` is considered to be `:effect_free`
as defined in `@assume_effects`.
"""
function _is_effect_free(f::F, t) where {F}
    # `f::F` with `where {F}` forces specialization on the function argument so
    # the effects query is answered for the concrete `f`.
    eff = Core.Compiler.infer_effects(f, t)
    # Only the strongest result (`ALWAYS_TRUE`) counts; any conditional or
    # unknown effect-freedom is treated as not effect-free.
    return eff.effect_free == Core.Compiler.ALWAYS_TRUE
end
| 13 | + |
# A rather very conservative lower bound for parallelism, chosen to avoid
# slowing down cases where parallelism is not helpful. The number is chosen
# such that the sequential `map(identity, ::Vector{Float64})` of this size
# takes about > 100 μs, so the cost of task spawn/sync is likely negligible
# even for a very trivial `f`. This is a mutable global (`Ref`) so that it is
# easy to lower it and check the parallel path in end-to-end tests.
const _MAP_MIN_BASESIZE = Ref(2^18)
| 21 | + |
"""
    _maybe_parallelize_collect(iter::Generator) -> Some(result) or nothing

Try to `collect` `iter` using multiple tasks. Return `Some(result)` on
success, or `nothing` to signal the caller to fall back to the sequential
`collect` path.
"""
function _maybe_parallelize_collect(iter::Generator)
    # TODO: use transducer to cleanly implement this
    # TODO: support filter, flatten, product, etc.
    # Only a plain array or a `zip` of arrays is supported for now; anything
    # else falls back to the sequential path.
    arrays = if iter.iter isa AbstractArray
        (iter.iter,)
    elseif iter.iter isa Iterators.Zip{<:Tuple{Vararg{AbstractArray}}}
        iter.iter.is
    else
        return nothing
    end
    arrays === () && return nothing  # empty `zip()`: nothing to split

    # TODO: Maybe avoid parallel implementation if `f` and `getindex` are also
    # `:consistent`? It can happen for something like FillArrays.

    # TODO: guess a good number from the cost of `f` and `getindex`?
    min_basesize = max(2, _MAP_MIN_BASESIZE[])
    length(arrays[1]) < min_basesize && return nothing

    # Only handle compatible shape (and thus uniform length) for now.
    all(==(axes(arrays[1])), map(axes, tail(arrays))) || return nothing
    shape = size(arrays[1]) # relies on the check above

    Threads.nthreads() == 1 && return nothing

    # Bail out unless indexing all inputs at once (`ith_all`) is known to be
    # `:effect_free`, since the parallel path reorders those calls.
    indices = eachindex(arrays...)
    indextype = eltype(indices)
    _is_effect_free(ith_all, Tuple{indextype,typeof(arrays)}) || return nothing

    # FIXME: `getvalue` captures `iter` just in case `f` is a `Type` (which
    # would be captured as a `DataType`)
    getvalue = if iter.iter isa AbstractArray
        # Single-array case: `f` receives the element itself.
        _is_effect_free(iter.f, Tuple{eltype(arrays[1])}) || return nothing
        @inline getvalue1(i) = iter.f(@inbounds arrays[1][i])
    else
        # Zip case: `f` receives a tuple of one element per input array.
        _is_effect_free(iter.f, Tuple{Tuple{map(eltype, arrays)...}}) || return nothing
        @inline getvalue2(i) = iter.f((@inbounds ith_all(i, arrays)))
    end

    # Cap the `basesize` assuming that the workload of `f` is uniform. This may
    # not be a good choice especially once we support filter and flatten.
    # However, since this code path is enabled automatically, it may be better
    # to play on the very safe side.
    # NOTE: `basesize` is only consumed by the `_parallel_map` branch below;
    # the `_parallel_map!` branch relies on `@threads` chunking instead.
    basesize = min(min_basesize, cld(length(indices), Threads.nthreads()))

    # Note: `@default_eltype` is incorrect if `T(....)` (with `T::Type`) does
    # not return a `T`. However, `collect(::Generator)` already uses `@default_eltype`.
    et = @default_eltype(iter)
    if isconcretetype(et)
        # We do not leak compiler internal here even though `et` is visible from
        # the user because we have checked that input is not empty. It is not
        # perfect since the sequential implementation of `collect(::Generator)`
        # itself does leak the compiler internal by returning `Array{et}`.
        # However, if/when `collect(::Generator)` solved this issue, the
        # parallel implementation does not get in the way of typocalypse-free
        # Base.
        dest = Array{et}(undef, size(arrays[1]))
        return Some(_parallel_map!(getvalue, dest, indices))
    else
        # TODO: use `_parallel_map!` if `allocatedinline(et)` and then refine
        # type (and fuse the mapping and the type bound computation)
        ys = _parallel_map(getvalue, indices; basesize)::Array{<:et}
        if length(shape) == 1
            return Some(ys)
        else
            # `_parallel_map` always returns a `Vector`; restore the shape.
            return Some(reshape(ys, shape))
        end
    end
end
| 91 | + |
"""
    _parallel_map!(f, dest, xs) -> dest

Multithreaded equivalent of `map!` (i.e., `dest .= f.(xs)`).

Remaining work before promoting this to a proper API (say `Threads.map!`):
* (ideally: define infrastructure for making it extensible)
* use basesize to control parallelism in a compositional manner
* reject obviously wrong inputs like `dest::BitArray` (or just support it)
"""
function _parallel_map!(f, dest, xs)
    # TODO: switch to a divide-and-conquer strategy for fast spawn and sync
    # TODO: avoid `@threads`
    # TODO: the caller allocates `dest` and `f` is effect-free, so
    # `@simd ivdep` would be applicable here
    Threads.@threads for idx in eachindex(dest, xs)
        val = @inbounds xs[idx]
        @inbounds dest[idx] = f(val)
    end
    return dest
end
| 112 | + |
"""
    _halve(xs) -> (left, right)

Split `xs` into two non-overlapping views that together cover it, for the
divide-and-conquer recursion in `_parallel_map`.
"""
function _halve(xs::AbstractVector)
    f = firstindex(xs)
    l = lastindex(xs)
    h = length(xs) ÷ 2
    # Give the first half exactly `h` elements so the two halves are balanced
    # (lengths differ by at most one). The previous `f:f+h` / `f+h+1:l` split
    # was off by one: it yielded `h + 1` / `n - h - 1` elements, e.g. 3/1 for
    # a 4-element input and 2/0 (an empty half) for a 2-element input.
    return view(xs, f:f+h-1), view(xs, f+h:l)
end
| 119 | + |
_halve(xs::AbstractArray) = _halve_ndarray(xs)

"""
    _halve_ndarray(xs::AbstractArray{<:Any,N}, ::Val{D} = Val(N)) -> (left, right)

Split `xs` into two views along dimension `D` (defaulting to the last),
falling back to lower dimensions when dimension `D` has fewer than 2 slices.
"""
function _halve_ndarray(xs::AbstractArray{<:Any,N}, ::Val{D} = Val(N)) where {N,D}
    # NOTE: the signature must be `AbstractArray{<:Any,N}`; the earlier
    # `AbstractArray{N}` bound `N` to the *element type* (the first type
    # parameter), making `Val(N)` a `Val` of a type and `D > 1` a
    # `MethodError` at runtime.
    if D > 1
        # This dimension cannot be split; try the next-lower one.
        size(xs, D) < 2 && return _halve_ndarray(xs, Val(D - 1))
    end
    f = firstindex(xs, D)
    l = lastindex(xs, D)
    h = size(xs, D) ÷ 2
    cs1 = ntuple(_ -> :, Val(D - 1))  # colons for the dimensions below `D`
    cs2 = ntuple(_ -> :, Val(N - D))  # colons for the dimensions above `D`
    # First half gets `h` slices so the split is balanced (≤ 1 difference).
    return view(xs, cs1..., f:f+h-1, cs2...), view(xs, cs1..., f+h:l, cs2...)
end
| 133 | + |
"""
    _parallel_map(f, xs; basesize) -> ys::Vector

Divide-and-conquer parallel `map`.

Note: The output is always a `Vector` even if the input can have arbitrary
`ndims`. The caller is responsible for `reshape`ing the output properly.
"""
function _parallel_map(f, xs; basesize)
    # Base case: small chunks are mapped sequentially.
    length(xs) <= max(2, basesize) && return vec(_serial_collect(Iterators.map(f, xs)))
    xs1, xs2 = _halve(xs)
    task = Threads.@spawn _parallel_map(f, xs2; basesize)
    ys1 = _parallel_map(f, xs1; basesize)::Vector
    ys2 = fetch(task)::Vector
    if eltype(ys2) <: eltype(ys1)
        # `ys1`'s eltype is at least as wide; append in order.
        return append!(ys1, ys2)
    elseif eltype(ys1) <: eltype(ys2)
        # `ys2`'s eltype is wider, so move `ys1`'s elements into it.
        # Bug fix: the previous `insert!(ys1, firstindex(ys1), ys2)` attempted
        # to insert the array `ys2` itself as a single *element* of the
        # narrower `ys1` and then returned `ys2` unmodified, silently dropping
        # every element of `ys1`.
        prepend!(ys2, ys1)
        return ys2
    else
        # Note: we cannot use `vcat` here since `collect` uses
        # `promote_typejoin` instead of `promote`
        T = promote_typejoin(eltype(ys1), eltype(ys2))
        # Allocate at full length then `empty!` so the buffer is pre-sized
        # before the `append!`.
        ys3 = empty!(Vector{T}(undef, length(ys1) + length(ys2)))
        return append!(ys3, ys1, ys2)
    end
end