Julia & Mojo Mandelbrot Benchmark

julia> @benchmark compute_mandelbrot!($result)
BenchmarkTools.Trial: 68 samples with 1 evaluation.
 Range (min … max):  14.030 ms …  15.901 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     14.824 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   14.836 ms ± 576.959 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

    ██        ▅                        ▂          ▂▂            
  █▁████▁▅█▅▁██▁▁▁▁█▁▅▁▁█▅▅▁█▁▅▅▁▅▅█▁█▅█▁▁█▅▁█▁▅▅▁██▅▅█▁▅▁▁▁▅▅ ▁
  14 ms           Histogram: frequency by time         15.9 ms <

 Memory estimate: 18.69 KiB, allocs estimate: 177.

julia> @benchmark compute_mandelbrot_turbo!($result)
BenchmarkTools.Trial: 404 samples with 1 evaluation.
 Range (min … max):  2.460 ms …  2.946 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     2.462 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.477 ms ± 77.247 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

julia> versioninfo()
Julia Version 1.9.3-DEV.0
Commit 6fc1be04ee (2023-07-06 14:55 UTC)
Platform Info:
  OS: Linux (x86_64-generic-linux)
  CPU: 28 × Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, skylake-avx512)
  Threads: 28 on 28 virtual cores

Code:

using LoopVectorization, VectorizationBase

# using Plots 

const xn = 960
const yn = 960
const xmin = -2.0
const xmax = 0.6
const ymin = -1.5
const ymax = 1.5
const MAX_ITERS = 200

@inline function mandelbrot_kernel(c)
  z = c
  for i = 1:MAX_ITERS
    z = z * z + c
    if abs2(z) > 4
      return i
    end
  end
  return MAX_ITERS
end
@inline function mandelbrot_kernel(r::Vec{W}, i::Vec{W}) where {W}
  z = c = complex(r,i)
  inactive = VectorizationBase.Mask(VectorizationBase.zero_mask(Val(W)))
  imask = VectorizationBase.vbroadcast(Val(W), 0)
  for i = 1:MAX_ITERS
    z = muladd(z, z, c)
    imask = ifelse(inactive, imask, i)
    inactive |= abs2(z) > 4
    VectorizationBase.vall(inactive) && return imask
  end
  return imask
end
@inline function mandelbrot_kernel(r::VecUnroll, i::VecUnroll)
VectorizationBase.VecUnroll(fmap(mandelbrot_kernel, VectorizationBase.data(r),VectorizationBase.data(i)))
end

function compute_mandelbrot!(result)

  x_range = range(xmin, xmax, xn)
  y_range = range(ymin, ymax, xn)

  Threads.@threads for j = 1:yn
    for i = 1:xn
      x = x_range[i]
      y = y_range[j]
      result[j, i] = mandelbrot_kernel(complex(x, y))
    end
  end
  return result
end
compute_mandelbrot() = compute_mandelbrot!(zeros(yn, xn))
function compute_mandelbrot_turbo!(result)
  x_range = range(xmin, xmax, xn)
  y_range = range(ymin, ymax, xn)
  
  @tturbo for j = 1:yn
    for i = 1:xn
      x = x_range[i]
      y = y_range[j]
      result[j, i] = mandelbrot_kernel(x, y)
    end
  end
  return result
end
compute_mandelbrot_turbo() = compute_mandelbrot_turbo!(zeros(yn, xn))
result = zeros(yn, xn);

@benchmark compute_mandelbrot!($result)
@benchmark compute_mandelbrot_turbo!($result)

EDIT:
I get

julia> @benchmark mandelbrot($result, $x_range, $y_range, Val(4))
BenchmarkTools.Trial: 312 samples with 1 evaluation.
 Range (min … max):  3.094 ms …   7.088 ms  ┊ GC (min … max): 0.00% … 52.95%
 Time  (median):     3.146 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.199 ms ± 438.872 μs  ┊ GC (mean ± σ):  1.49% ±  5.96%

  █▅                                                           
  ██▇▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▆
  3.09 ms      Histogram: log(frequency) by time      6.98 ms <

 Memory estimate: 593.08 KiB, allocs estimate: 4843.

from @sdanisch’s example.

I also see better perf with more threads, but LV is limiting itself to 1 per core and not using hyperthreads.

8 Likes