From 0fa8313ead7cb27745f2ff3cfe65a690ddcd65c9 Mon Sep 17 00:00:00 2001 From: Adam Beckmeyer Date: Thu, 19 Dec 2019 14:59:46 -0500 Subject: [PATCH 1/2] Tweak mandelbrot-fast.jl for performance. Multiple versions with different threadings included because different versions are faster depending on the machine. Depending on machine, gains can be over 20% compared to original mandelbrot-fast.jl. NOTE: running mandelbrot-fast.v3.jl requires installation of https://github.com/mohamed82008/KissThreading.jl Changes included in every version: - Removing threading from filling xvals and yvals--threading overhead is too high for such a simple operation. - Remove @simd annotation from mandel_inner--simd is occurring at the level of mand8; @simd doesn't hurt runtime but increases compilation time. - Only run mandelbrot when !isinteractive() to make development and debugging easier. - Various tweaks and minor stylistic updates for succinctness and maybe a marginal increase in performance. --- mandelbrot/mandelbrot-fast.jl | 81 +++++++++++++++----------------- mandelbrot/mandelbrot-fast.v2.jl | 63 +++++++++++++++++++++++++ mandelbrot/mandelbrot-fast.v3.jl | 64 +++++++++++++++++++++++++ 3 files changed, 164 insertions(+), 44 deletions(-) create mode 100644 mandelbrot/mandelbrot-fast.v2.jl create mode 100644 mandelbrot/mandelbrot-fast.v3.jl diff --git a/mandelbrot/mandelbrot-fast.jl b/mandelbrot/mandelbrot-fast.jl index cda1933..9e368e2 100644 --- a/mandelbrot/mandelbrot-fast.jl +++ b/mandelbrot/mandelbrot-fast.jl @@ -1,74 +1,67 @@ #= The Computer Language Benchmarks Game https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html - modified for Julia 1.0 by Simon Danisch + + modified for Julia 1.0 by Simon Danisch. + tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer. =# const zerov8 = ntuple(x-> 0f0, 8) - -@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci) - Zi = 2f0 .* Zr .* Zi .+ ci - Zr = Tr .- Ti .+ cr - Tr = Zr .* Zr - Ti = Zi .* Zi - return Zr,Zi,Tr,Ti -end +const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, + 0b11111011, 0b11111101, 0b11111110) # Calculate mandelbrot set for one Vec8 into one byte Base.@propagate_inbounds function mand8(cr, ci) - Zr = zerov8 - Zi = zerov8 - Tr = zerov8 - Ti = zerov8 - t = zerov8 + Zr = Zi = Tr = Ti = t = zerov8 i = 0 - while i<50 - for _ in 1:5 - Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci) - i += 1 + for _=1:10 + for _=1:5 + Zi = 2f0 .* Zr .* Zi .+ ci + Zr = Tr .- Ti .+ cr + Tr = Zr .* Zr + Ti = Zi .* Zi end t = Tr .+ Ti all(x-> x > 4f0, t) && (return 0x00) end + byte = 0xff - t[1] <= 4.0 || (byte &= 0b01111111) - t[2] <= 4.0 || (byte &= 0b10111111) - t[3] <= 4.0 || (byte &= 0b11011111) - t[4] <= 4.0 || (byte &= 0b11101111) - t[5] <= 4.0 || (byte &= 0b11110111) - t[6] <= 4.0 || (byte &= 0b11111011) - t[7] <= 4.0 || (byte &= 0b11111101) - t[8] <= 4.0 || (byte &= 0b11111110) + for i=1:8 + t[i] <= 4.0 || (byte &= masks[i]) + end return byte end function mandel_inner(rows, ci, y, N, xvals) - @simd for x in 1:8:N - @inbounds begin - cr = ntuple(i-> xvals[x + (i - 1)], 8) - rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) - end + @inbounds for x=1:8:N + cr = ntuple(i-> xvals[x + i - 1], 8) + rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) end end -function mandelbrot(n = 200) +function mandelbrot(io, n = 200) inv_ = 2.0 / n - N = n - xvals = zeros(Float32, n) - yvals = zeros(Float32, n) - Threads.@threads for i in 0:(N-1) - @inbounds xvals[i + 1] = i * inv_ - 1.5 - @inbounds yvals[i + 1] = i * inv_ - 1.0 + xvals = Vector{Float32}(undef, n) + yvals = Vector{Float32}(undef, n) + @inbounds for i in 0:(n-1) + xvals[i + 1] = i * inv_ - 1.5 + yvals[i + 1] = i * inv_ - 1.0 end - rows = zeros(UInt8, n*N÷8) - Threads.@threads for y in 1:N + + rows = Vector{UInt8}(undef, n^2 ÷ 8) + @sync for y=1:n @inbounds ci = yvals[y] - mandel_inner(rows, ci, y, N, xvals) + # This allows dynamic scheduling instead of static scheduling + # of Threads.@threads macro. See + # https://github.com/JuliaLang/julia/issues/21017 . On some + # computers this is faster, on others not. + Threads.@spawn mandel_inner(rows, ci, y, n, xvals) end - write(stdout, "P4\n$n $n\n") - write(stdout, rows) + write(io, "P4\n$n $n\n") + write(io, rows) end -mandelbrot(parse(Int, ARGS[1])) +isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1])) diff --git a/mandelbrot/mandelbrot-fast.v2.jl b/mandelbrot/mandelbrot-fast.v2.jl new file mode 100644 index 0000000..5a4ee28 --- /dev/null +++ b/mandelbrot/mandelbrot-fast.v2.jl @@ -0,0 +1,63 @@ +#= +The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + + direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: + https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html + + modified for Julia 1.0 by Simon Danisch. + tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer. +=# +const zerov8 = ntuple(x-> 0f0, 8) +const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, + 0b11111011, 0b11111101, 0b11111110) + +# Calculate mandelbrot set for one Vec8 into one byte +Base.@propagate_inbounds function mand8(cr, ci) + Zr = Zi = Tr = Ti = t = zerov8 + i = 0 + + for _=1:10 + for _=1:5 + Zi = 2f0 .* Zr .* Zi .+ ci + Zr = Tr .- Ti .+ cr + Tr = Zr .* Zr + Ti = Zi .* Zi + end + t = Tr .+ Ti + all(x-> x > 4f0, t) && (return 0x00) + end + + byte = 0xff + for i=1:8 + t[i] <= 4.0 || (byte &= masks[i]) + end + return byte +end + +function mandel_inner(rows, ci, y, N, xvals) + @inbounds for x=1:8:N + cr = ntuple(i-> xvals[x + i - 1], 8) + rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) + end +end + +function mandelbrot(io, n = 200) + inv_ = 2.0 / n + xvals = Vector{Float32}(undef, n) + yvals = Vector{Float32}(undef, n) + @inbounds for i in 0:(n-1) + xvals[i + 1] = i * inv_ - 1.5 + yvals[i + 1] = i * inv_ - 1.0 + end + + rows = Vector{UInt8}(undef, n^2 ÷ 8) + Threads.@threads for y=1:n + @inbounds ci = yvals[y] + mandel_inner(rows, ci, y, n, xvals) + end + write(io, "P4\n$n $n\n") + write(io, rows) +end + +isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1])) diff --git a/mandelbrot/mandelbrot-fast.v3.jl b/mandelbrot/mandelbrot-fast.v3.jl new file mode 100644 index 0000000..312bc19 --- /dev/null +++ b/mandelbrot/mandelbrot-fast.v3.jl @@ -0,0 +1,64 @@ +#= +The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + + direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: + https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html + + modified for Julia 1.0 by Simon Danisch. + tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer. +=# +using KissThreading + +const zerov8 = ntuple(x-> 0f0, 8) +const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, + 0b11111011, 0b11111101, 0b11111110) + +# Calculate mandelbrot set for one Vec8 into one byte +Base.@propagate_inbounds function mand8(cr, ci) + Zr = Zi = Tr = Ti = t = zerov8 + i = 0 + + for _=1:10 + for _=1:5 + Zi = 2f0 .* Zr .* Zi .+ ci + Zr = Tr .- Ti .+ cr + Tr = Zr .* Zr + Ti = Zi .* Zi + end + t = Tr .+ Ti + all(x-> x > 4f0, t) && (return 0x00) + end + + byte = 0xff + for i=1:8 + t[i] <= 4.0 || (byte &= masks[i]) + end + return byte +end + +function mandel_inner(rows, ci, y, N, xvals) + @inbounds for x=1:8:N + cr = ntuple(i-> xvals[x + i - 1], 8) + rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) + end +end + +function mandelbrot(io, n = 200) + inv_ = 2.0 / n + xvals = Vector{Float32}(undef, n) + yvals = Vector{Float32}(undef, n) + @inbounds for i in 0:(n-1) + xvals[i + 1] = i * inv_ - 1.5 + yvals[i + 1] = i * inv_ - 1.0 + end + + rows = Vector{UInt8}(undef, n^2 ÷ 8) + f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals) + tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8) + + write(io, "P4\n$n $n\n") + write(io, rows) +end + +isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1])) From 204f6e395d8f200c8cb02df8dbd1d44574fe4ce3 Mon Sep 17 00:00:00 2001 From: Adam Beckmeyer Date: Fri, 20 Dec 2019 17:39:28 -0500 Subject: [PATCH 2/2] Convert to using Float64 instead of Float32 Different results obtained for higher n with Float32 than other implementations, so it's not allowed. --- mandelbrot/mandelbrot-fast.jl | 10 +++++----- mandelbrot/mandelbrot-fast.v2.jl | 10 +++++----- mandelbrot/mandelbrot-fast.v3.jl | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mandelbrot/mandelbrot-fast.jl b/mandelbrot/mandelbrot-fast.jl index 9e368e2..0135db7 100644 --- a/mandelbrot/mandelbrot-fast.jl +++ b/mandelbrot/mandelbrot-fast.jl @@ -8,7 +8,7 @@ The Computer Language Benchmarks Game modified for Julia 1.0 by Simon Danisch. tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer. =# -const zerov8 = ntuple(x-> 0f0, 8) +const zerov8 = ntuple(x-> 0.0, 8) const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, 0b11111011, 0b11111101, 0b11111110) @@ -19,13 +19,13 @@ Base.@propagate_inbounds function mand8(cr, ci) for _=1:10 for _=1:5 - Zi = 2f0 .* Zr .* Zi .+ ci + Zi = 2.0 .* Zr .* Zi .+ ci Zr = Tr .- Ti .+ cr Tr = Zr .* Zr Ti = Zi .* Zi end t = Tr .+ Ti - all(x-> x > 4f0, t) && (return 0x00) + all(x-> x > 4.0, t) && (return 0x00) end byte = 0xff @@ -44,8 +44,8 @@ end function mandelbrot(io, n = 200) inv_ = 2.0 / n - xvals = Vector{Float32}(undef, n) - yvals = Vector{Float32}(undef, n) + xvals = Vector{Float64}(undef, n) + yvals = Vector{Float64}(undef, n) @inbounds for i in 0:(n-1) xvals[i + 1] = i * inv_ - 1.5 yvals[i + 1] = i * inv_ - 1.0 diff --git a/mandelbrot/mandelbrot-fast.v2.jl b/mandelbrot/mandelbrot-fast.v2.jl index 5a4ee28..9af8c37 100644 --- a/mandelbrot/mandelbrot-fast.v2.jl +++ b/mandelbrot/mandelbrot-fast.v2.jl @@ -8,7 +8,7 @@ The Computer Language Benchmarks Game modified for Julia 1.0 by Simon Danisch. tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer. =# -const zerov8 = ntuple(x-> 0f0, 8) +const zerov8 = ntuple(x-> 0.0, 8) const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, 0b11111011, 0b11111101, 0b11111110) @@ -19,13 +19,13 @@ Base.@propagate_inbounds function mand8(cr, ci) for _=1:10 for _=1:5 - Zi = 2f0 .* Zr .* Zi .+ ci + Zi = 2.0 .* Zr .* Zi .+ ci Zr = Tr .- Ti .+ cr Tr = Zr .* Zr Ti = Zi .* Zi end t = Tr .+ Ti - all(x-> x > 4f0, t) && (return 0x00) + all(x-> x > 4.0, t) && (return 0x00) end byte = 0xff @@ -44,8 +44,8 @@ end function mandelbrot(io, n = 200) inv_ = 2.0 / n - xvals = Vector{Float32}(undef, n) - yvals = Vector{Float32}(undef, n) + xvals = Vector{Float64}(undef, n) + yvals = Vector{Float64}(undef, n) @inbounds for i in 0:(n-1) xvals[i + 1] = i * inv_ - 1.5 yvals[i + 1] = i * inv_ - 1.0 diff --git a/mandelbrot/mandelbrot-fast.v3.jl b/mandelbrot/mandelbrot-fast.v3.jl index 312bc19..8b49960 100644 --- a/mandelbrot/mandelbrot-fast.v3.jl +++ b/mandelbrot/mandelbrot-fast.v3.jl @@ -10,7 +10,7 @@ The Computer Language Benchmarks Game =# using KissThreading -const zerov8 = ntuple(x-> 0f0, 8) +const zerov8 = ntuple(x-> 0.0, 8) const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111, 0b11111011, 0b11111101, 0b11111110) @@ -21,13 +21,13 @@ Base.@propagate_inbounds function mand8(cr, ci) for _=1:10 for _=1:5 - Zi = 2f0 .* Zr .* Zi .+ ci + Zi = 2.0 .* Zr .* Zi .+ ci Zr = Tr .- Ti .+ cr Tr = Zr .* Zr Ti = Zi .* Zi end t = Tr .+ Ti - all(x-> x > 4f0, t) && (return 0x00) + all(x-> x > 4.0, t) && (return 0x00) end byte = 0xff @@ -46,8 +46,8 @@ end function mandelbrot(io, n = 200) inv_ = 2.0 / n - xvals = Vector{Float32}(undef, n) - yvals = Vector{Float32}(undef, n) + xvals = Vector{Float64}(undef, n) + yvals = Vector{Float64}(undef, n) @inbounds for i in 0:(n-1) xvals[i + 1] = i * inv_ - 1.5 yvals[i + 1] = i * inv_ - 1.0