From 0fa8313ead7cb27745f2ff3cfe65a690ddcd65c9 Mon Sep 17 00:00:00 2001
From: Adam Beckmeyer <adam_gpg@thebeckmeyers.xyz>
Date: Thu, 19 Dec 2019 14:59:46 -0500
Subject: [PATCH 1/2] Tweak mandelbrot-fast.jl for performance.

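Multiple versions with different threading strategies are included
because which version is fastest depends on the machine:
- mandelbrot-fast.jl spawns one task per row with Threads.@spawn.
- mandelbrot-fast.v2.jl uses the Threads.@threads macro.
- mandelbrot-fast.v3.jl uses tmap! from KissThreading.
Depending on the machine, gains can be over 20% compared to the
original mandelbrot-fast.jl.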

NOTE: running mandelbrot-fast.v3.jl requires installation of
https://github.com/mohamed82008/KissThreading.jl

Changes included in every version:
- Remove threading from filling xvals and yvals--threading overhead
  is too high for such a simple operation.
- Remove @simd annotation from mandel_inner--SIMD already happens at
  the level of mand8; @simd doesn't hurt runtime but increases
  compilation time.
- Only run mandelbrot when !isinteractive() to make development and
  debugging easier.
- Various tweaks and minor stylistic updates for succinctness and
  maybe a marginal increase in performance.
---
 mandelbrot/mandelbrot-fast.jl    | 81 +++++++++++++++-----------------
 mandelbrot/mandelbrot-fast.v2.jl | 63 +++++++++++++++++++++++++
 mandelbrot/mandelbrot-fast.v3.jl | 64 +++++++++++++++++++++++++
 3 files changed, 164 insertions(+), 44 deletions(-)
 create mode 100644 mandelbrot/mandelbrot-fast.v2.jl
 create mode 100644 mandelbrot/mandelbrot-fast.v3.jl

diff --git a/mandelbrot/mandelbrot-fast.jl b/mandelbrot/mandelbrot-fast.jl
index cda1933..9e368e2 100644
--- a/mandelbrot/mandelbrot-fast.jl
+++ b/mandelbrot/mandelbrot-fast.jl
@@ -1,74 +1,67 @@
 #=
 The Computer Language Benchmarks Game
  https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
  direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
  https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
- modified for Julia 1.0 by Simon Danisch
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
 =#
 const zerov8 = ntuple(x-> 0f0, 8)
-
-@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
-    Zi = 2f0 .* Zr .* Zi .+ ci
-    Zr = Tr .- Ti .+ cr
-    Tr = Zr .* Zr
-    Ti = Zi .* Zi
-    return Zr,Zi,Tr,Ti
-end
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
 
 # Calculate mandelbrot set for one Vec8 into one byte
 Base.@propagate_inbounds function mand8(cr, ci)
-    Zr = zerov8
-    Zi = zerov8
-    Tr = zerov8
-    Ti = zerov8
-    t = zerov8
+    Zr = Zi = Tr = Ti = t = zerov8
     i = 0
 
-    while i<50
-        for _ in 1:5
-            Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
-            i += 1
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
         end
         t = Tr .+ Ti
         all(x-> x > 4f0, t) && (return 0x00)
     end
+
     byte = 0xff
-    t[1] <= 4.0 || (byte &= 0b01111111)
-    t[2] <= 4.0 || (byte &= 0b10111111)
-    t[3] <= 4.0 || (byte &= 0b11011111)
-    t[4] <= 4.0 || (byte &= 0b11101111)
-    t[5] <= 4.0 || (byte &= 0b11110111)
-    t[6] <= 4.0 || (byte &= 0b11111011)
-    t[7] <= 4.0 || (byte &= 0b11111101)
-    t[8] <= 4.0 || (byte &= 0b11111110)
+    for i=1:8
+        t[i] <= 4.0 || (byte &= masks[i])
+    end
     return byte
 end
 
 function mandel_inner(rows, ci, y, N, xvals)
-    @simd for x in 1:8:N
-        @inbounds begin
-            cr = ntuple(i-> xvals[x + (i - 1)], 8)
-            rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
-        end
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
     end
 end
 
-function mandelbrot(n = 200)
+function mandelbrot(io, n = 200)
     inv_ = 2.0 / n
-    N = n
-    xvals = zeros(Float32, n)
-    yvals = zeros(Float32, n)
-    Threads.@threads for i in 0:(N-1)
-        @inbounds xvals[i + 1] = i * inv_ - 1.5
-        @inbounds yvals[i + 1] = i * inv_ - 1.0
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
     end
-    rows = zeros(UInt8, n*N÷8)
-    Threads.@threads for y in 1:N
+
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    @sync for y=1:n
         @inbounds ci = yvals[y]
-        mandel_inner(rows, ci, y, N, xvals)
+        # This allows dynamic scheduling instead of static scheduling
+        # of Threads.@threads macro. See
+        # https://github.com/JuliaLang/julia/issues/21017 . On some
+        # computers this is faster, on others not.
+        Threads.@spawn mandel_inner(rows, ci, y, n, xvals)
     end
-    write(stdout, "P4\n$n $n\n")
-    write(stdout, rows)
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
 end
 
-mandelbrot(parse(Int, ARGS[1]))
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))
diff --git a/mandelbrot/mandelbrot-fast.v2.jl b/mandelbrot/mandelbrot-fast.v2.jl
new file mode 100644
index 0000000..5a4ee28
--- /dev/null
+++ b/mandelbrot/mandelbrot-fast.v2.jl
@@ -0,0 +1,63 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
+=#
+const zerov8 = ntuple(x-> 0f0, 8)
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
+
+# Calculate mandelbrot set for one Vec8 into one byte
+Base.@propagate_inbounds function mand8(cr, ci)
+    Zr = Zi = Tr = Ti = t = zerov8
+    i = 0
+
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
+        end
+        t = Tr .+ Ti
+        all(x-> x > 4f0, t) && (return 0x00)
+    end
+
+    byte = 0xff
+    for i=1:8
+        t[i] <= 4.0 || (byte &= masks[i])
+    end
+    return byte
+end
+
+function mandel_inner(rows, ci, y, N, xvals)
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
+    end
+end
+
+function mandelbrot(io, n = 200)
+    inv_ = 2.0 / n
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
+    end
+
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    Threads.@threads for y=1:n
+        @inbounds ci = yvals[y]
+        mandel_inner(rows, ci, y, n, xvals)
+    end
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
+end
+
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))
diff --git a/mandelbrot/mandelbrot-fast.v3.jl b/mandelbrot/mandelbrot-fast.v3.jl
new file mode 100644
index 0000000..312bc19
--- /dev/null
+++ b/mandelbrot/mandelbrot-fast.v3.jl
@@ -0,0 +1,64 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
+
+ modified for Julia 1.0 by Simon Danisch.
+ tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
+=#
+using KissThreading
+
+const zerov8 = ntuple(x-> 0f0, 8)
+const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
+               0b11111011, 0b11111101, 0b11111110)
+
+# Calculate mandelbrot set for one Vec8 into one byte
+Base.@propagate_inbounds function mand8(cr, ci)
+    Zr = Zi = Tr = Ti = t = zerov8
+    i = 0
+
+    for _=1:10
+        for _=1:5
+            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zr = Tr .- Ti .+ cr
+            Tr = Zr .* Zr
+            Ti = Zi .* Zi
+        end
+        t = Tr .+ Ti
+        all(x-> x > 4f0, t) && (return 0x00)
+    end
+
+    byte = 0xff
+    for i=1:8
+        t[i] <= 4.0 || (byte &= masks[i])
+    end
+    return byte
+end
+
+function mandel_inner(rows, ci, y, N, xvals)
+    @inbounds for x=1:8:N
+        cr = ntuple(i-> xvals[x + i - 1], 8)
+        rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
+    end
+end
+
+function mandelbrot(io, n = 200)
+    inv_ = 2.0 / n
+    xvals = Vector{Float32}(undef, n)
+    yvals = Vector{Float32}(undef, n)
+    @inbounds for i in 0:(n-1)
+        xvals[i + 1] = i * inv_ - 1.5
+        yvals[i + 1] = i * inv_ - 1.0
+    end
+
+    rows = Vector{UInt8}(undef, n^2 ÷ 8)
+    f(y) = @inbounds mandel_inner(rows, yvals[y], y, n, xvals)
+    tmap!(f, Vector{Nothing}(undef, n), collect(1:n); batch_size=8)
+
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
+end
+
+isinteractive() || mandelbrot(stdout, parse(Int, ARGS[1]))

From 204f6e395d8f200c8cb02df8dbd1d44574fe4ce3 Mon Sep 17 00:00:00 2001
From: Adam Beckmeyer <adam_gpg@thebeckmeyers.xyz>
Date: Fri, 20 Dec 2019 17:39:28 -0500
Subject: [PATCH 2/2] Convert to using Float64 instead of Float32

With Float32, the results for higher n differ from those of other
implementations, so Float32 is not allowed.
---
 mandelbrot/mandelbrot-fast.jl    | 10 +++++-----
 mandelbrot/mandelbrot-fast.v2.jl | 10 +++++-----
 mandelbrot/mandelbrot-fast.v3.jl | 10 +++++-----
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/mandelbrot/mandelbrot-fast.jl b/mandelbrot/mandelbrot-fast.jl
index 9e368e2..0135db7 100644
--- a/mandelbrot/mandelbrot-fast.jl
+++ b/mandelbrot/mandelbrot-fast.jl
@@ -8,7 +8,7 @@ The Computer Language Benchmarks Game
  modified for Julia 1.0 by Simon Danisch.
  tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
 =#
-const zerov8 = ntuple(x-> 0f0, 8)
+const zerov8 = ntuple(x-> 0.0, 8)
 const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
                0b11111011, 0b11111101, 0b11111110)
 
@@ -19,13 +19,13 @@ Base.@propagate_inbounds function mand8(cr, ci)
 
     for _=1:10
         for _=1:5
-            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zi = 2.0 .* Zr .* Zi .+ ci
             Zr = Tr .- Ti .+ cr
             Tr = Zr .* Zr
             Ti = Zi .* Zi
         end
         t = Tr .+ Ti
-        all(x-> x > 4f0, t) && (return 0x00)
+        all(x-> x > 4.0, t) && (return 0x00)
     end
 
     byte = 0xff
@@ -44,8 +44,8 @@ end
 
 function mandelbrot(io, n = 200)
     inv_ = 2.0 / n
-    xvals = Vector{Float32}(undef, n)
-    yvals = Vector{Float32}(undef, n)
+    xvals = Vector{Float64}(undef, n)
+    yvals = Vector{Float64}(undef, n)
     @inbounds for i in 0:(n-1)
         xvals[i + 1] = i * inv_ - 1.5
         yvals[i + 1] = i * inv_ - 1.0
diff --git a/mandelbrot/mandelbrot-fast.v2.jl b/mandelbrot/mandelbrot-fast.v2.jl
index 5a4ee28..9af8c37 100644
--- a/mandelbrot/mandelbrot-fast.v2.jl
+++ b/mandelbrot/mandelbrot-fast.v2.jl
@@ -8,7 +8,7 @@ The Computer Language Benchmarks Game
  modified for Julia 1.0 by Simon Danisch.
  tweaked for performance by https://github.com/maltezfaria and Adam Beckmeyer.
 =#
-const zerov8 = ntuple(x-> 0f0, 8)
+const zerov8 = ntuple(x-> 0.0, 8)
 const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
                0b11111011, 0b11111101, 0b11111110)
 
@@ -19,13 +19,13 @@ Base.@propagate_inbounds function mand8(cr, ci)
 
     for _=1:10
         for _=1:5
-            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zi = 2.0 .* Zr .* Zi .+ ci
             Zr = Tr .- Ti .+ cr
             Tr = Zr .* Zr
             Ti = Zi .* Zi
         end
         t = Tr .+ Ti
-        all(x-> x > 4f0, t) && (return 0x00)
+        all(x-> x > 4.0, t) && (return 0x00)
     end
 
     byte = 0xff
@@ -44,8 +44,8 @@ end
 
 function mandelbrot(io, n = 200)
     inv_ = 2.0 / n
-    xvals = Vector{Float32}(undef, n)
-    yvals = Vector{Float32}(undef, n)
+    xvals = Vector{Float64}(undef, n)
+    yvals = Vector{Float64}(undef, n)
     @inbounds for i in 0:(n-1)
         xvals[i + 1] = i * inv_ - 1.5
         yvals[i + 1] = i * inv_ - 1.0
diff --git a/mandelbrot/mandelbrot-fast.v3.jl b/mandelbrot/mandelbrot-fast.v3.jl
index 312bc19..8b49960 100644
--- a/mandelbrot/mandelbrot-fast.v3.jl
+++ b/mandelbrot/mandelbrot-fast.v3.jl
@@ -10,7 +10,7 @@ The Computer Language Benchmarks Game
 =#
 using KissThreading
 
-const zerov8 = ntuple(x-> 0f0, 8)
+const zerov8 = ntuple(x-> 0.0, 8)
 const masks = (0b01111111, 0b10111111, 0b11011111, 0b11101111, 0b11110111,
                0b11111011, 0b11111101, 0b11111110)
 
@@ -21,13 +21,13 @@ Base.@propagate_inbounds function mand8(cr, ci)
 
     for _=1:10
         for _=1:5
-            Zi = 2f0 .* Zr .* Zi .+ ci
+            Zi = 2.0 .* Zr .* Zi .+ ci
             Zr = Tr .- Ti .+ cr
             Tr = Zr .* Zr
             Ti = Zi .* Zi
         end
         t = Tr .+ Ti
-        all(x-> x > 4f0, t) && (return 0x00)
+        all(x-> x > 4.0, t) && (return 0x00)
     end
 
     byte = 0xff
@@ -46,8 +46,8 @@ end
 
 function mandelbrot(io, n = 200)
     inv_ = 2.0 / n
-    xvals = Vector{Float32}(undef, n)
-    yvals = Vector{Float32}(undef, n)
+    xvals = Vector{Float64}(undef, n)
+    yvals = Vector{Float64}(undef, n)
     @inbounds for i in 0:(n-1)
         xvals[i + 1] = i * inv_ - 1.5
         yvals[i + 1] = i * inv_ - 1.0