New best:
julia> @benchmark compute_mandelbrot_polyturbo!($result)
BenchmarkTools.Trial: 652 samples with 1 evaluation.
Range (min … max): 1.494 ms … 4.784 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.520 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.533 ms ± 147.023 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▃ ▂▄▆██▆▃ ▁ ▂
██▇▄▁▁▆▄▁▇███████▇███▇▆▇███▆▆█████▇▅▇▅▁▇▄▅▇▆▅▁▄▅▁▁▁▁▁▁▁▁▁▁▄ █
1.49 ms Histogram: log(frequency) by time 1.6 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
code
using LoopVectorization, VectorizationBase, Polyester
# using Plots
const xn = 960
const yn = 960
const xmin = -2.0
const xmax = 0.6
const ymin = -1.5
const ymax = 1.5
const MAX_ITERS = 200
@inline function mandelbrot_kernel(c)
z = c
for i = 1:MAX_ITERS
z = z * z + c
if abs2(z) > 4
return i
end
end
return MAX_ITERS
end
@inline function mandelbrot_kernel(r, i::Vec{W}) where {W}
z = c = complex(r,i)
inactive = VectorizationBase.Mask(VectorizationBase.zero_mask(Val(W)))
imask = VectorizationBase.vbroadcast(Val(W), 0)
for i = 1:MAX_ITERS
z = muladd(z, z, c)
imask = ifelse(inactive, imask, i)
inactive |= abs2(z) > 4
VectorizationBase.vall(inactive) && return imask
end
return imask
end
@inline function mandelbrot_kernel(r::VecUnroll, i::VecUnroll)
VectorizationBase.VecUnroll(VectorizationBase.fmap(mandelbrot_kernel, VectorizationBase.data(r),VectorizationBase.data(i)))
end
@inline function mandelbrot_kernel(r::Vec, i::VecUnroll)
VectorizationBase.VecUnroll(VectorizationBase.fmap(mandelbrot_kernel, r, VectorizationBase.data(i)))
end
@inline function mandelbrot_kernel(r::VecUnroll, i::Vec)
VectorizationBase.VecUnroll(VectorizationBase.fmap(mandelbrot_kernel, VectorizationBase.data(r),i))
end
function compute_mandelbrot!(result)
x_range = range(xmin, xmax, xn)
y_range = range(ymin, ymax, xn)
Threads.@threads for j = 1:yn
for i = 1:xn
x = x_range[i]
y = y_range[j]
result[j, i] = mandelbrot_kernel(complex(x, y))
end
end
return result
end
compute_mandelbrot() = compute_mandelbrot!(zeros(yn, xn))
function compute_mandelbrot_turbo!(result)
x_range = range(xmin, xmax, xn)
y_range = range(ymin, ymax, xn)
@tturbo for j = 1:yn
for i = 1:xn
x = x_range[i]
y = y_range[j]
result[j, i] = mandelbrot_kernel(x, y)
end
end
return result
end
function compute_mandelbrot_polyturbo!(result)
x_range = range(xmin, xmax, xn)
y_range = range(ymin, ymax, xn)
@batch per=thread for i = 1:xn
x = x_range[i]
@turbo for j = 1:yn
y = y_range[j]
result[j, i] = mandelbrot_kernel(x, y)
end
end
return result
end
compute_mandelbrot_turbo() = compute_mandelbrot_turbo!(zeros(yn, xn))
compute_mandelbrot_polyturbo() = compute_mandelbrot_polyturbo!(zeros(yn, xn))
result = zeros(yn, xn);
@benchmark compute_mandelbrot!($result)
@benchmark compute_mandelbrot_turbo!($result)
@benchmark compute_mandelbrot_polyturbo!($result)