julia> @benchmark compute_mandelbrot!($result)
BenchmarkTools.Trial: 68 samples with 1 evaluation.
Range (min … max): 14.030 ms … 15.901 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 14.824 ms ┊ GC (median): 0.00%
Time (mean ± σ): 14.836 ms ± 576.959 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
██ ▅ ▂ ▂▂
█▁████▁▅█▅▁██▁▁▁▁█▁▅▁▁█▅▅▁█▁▅▅▁▅▅█▁█▅█▁▁█▅▁█▁▅▅▁██▅▅█▁▅▁▁▁▅▅ ▁
14 ms Histogram: frequency by time 15.9 ms <
Memory estimate: 18.69 KiB, allocs estimate: 177.
julia> @benchmark compute_mandelbrot_turbo!($result)
BenchmarkTools.Trial: 404 samples with 1 evaluation.
Range (min … max): 2.460 ms … 2.946 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.462 ms ┊ GC (median): 0.00%
Time (mean ± σ): 2.477 ms ± 77.247 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
julia> versioninfo()
Julia Version 1.9.3-DEV.0
Commit 6fc1be04ee (2023-07-06 14:55 UTC)
Platform Info:
OS: Linux (x86_64-generic-linux)
CPU: 28 × Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, skylake-avx512)
Threads: 28 on 28 virtual cores
Code:
using LoopVectorization, VectorizationBase
# using Plots
const xn = 960
const yn = 960
const xmin = -2.0
const xmax = 0.6
const ymin = -1.5
const ymax = 1.5
const MAX_ITERS = 200
@inline function mandelbrot_kernel(c)
z = c
for i = 1:MAX_ITERS
z = z * z + c
if abs2(z) > 4
return i
end
end
return MAX_ITERS
end
@inline function mandelbrot_kernel(r::Vec{W}, i::Vec{W}) where {W}
z = c = complex(r,i)
inactive = VectorizationBase.Mask(VectorizationBase.zero_mask(Val(W)))
imask = VectorizationBase.vbroadcast(Val(W), 0)
for i = 1:MAX_ITERS
z = muladd(z, z, c)
imask = ifelse(inactive, imask, i)
inactive |= abs2(z) > 4
VectorizationBase.vall(inactive) && return imask
end
return imask
end
@inline function mandelbrot_kernel(r::VecUnroll, i::VecUnroll)
VectorizationBase.VecUnroll(fmap(mandelbrot_kernel, VectorizationBase.data(r),VectorizationBase.data(i)))
end
function compute_mandelbrot!(result)
x_range = range(xmin, xmax, xn)
y_range = range(ymin, ymax, xn)
Threads.@threads for j = 1:yn
for i = 1:xn
x = x_range[i]
y = y_range[j]
result[j, i] = mandelbrot_kernel(complex(x, y))
end
end
return result
end
compute_mandelbrot() = compute_mandelbrot!(zeros(yn, xn))
function compute_mandelbrot_turbo!(result)
x_range = range(xmin, xmax, xn)
y_range = range(ymin, ymax, xn)
@tturbo for j = 1:yn
for i = 1:xn
x = x_range[i]
y = y_range[j]
result[j, i] = mandelbrot_kernel(x, y)
end
end
return result
end
compute_mandelbrot_turbo() = compute_mandelbrot_turbo!(zeros(yn, xn))
result = zeros(yn, xn);
@benchmark compute_mandelbrot!($result)
@benchmark compute_mandelbrot_turbo!($result)
EDIT:
I get
julia> @benchmark mandelbrot($result, $x_range, $y_range, Val(4))
BenchmarkTools.Trial: 312 samples with 1 evaluation.
Range (min … max): 3.094 ms … 7.088 ms ┊ GC (min … max): 0.00% … 52.95%
Time (median): 3.146 ms ┊ GC (median): 0.00%
Time (mean ± σ): 3.199 ms ± 438.872 μs ┊ GC (mean ± σ): 1.49% ± 5.96%
█▅
██▇▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▆
3.09 ms Histogram: log(frequency) by time 6.98 ms <
Memory estimate: 593.08 KiB, allocs estimate: 4843.
from @sdanisch’s example.
I also see better perf with more threads, but LV is limiting itself to 1 per core and not using hyperthreads.