For those of you who aren’t aware, the Mojo SDK was recently released, so I thought I would take the opportunity to start benchmarking some Julia code against Mojo. As a first test, I am calculating the Mandelbrot set using the code provided by Modular.
This is my Julia implementation:
using Plots
# Grid resolution: number of sample points along the x (real) and y (imaginary) axes.
const xn = 960
const yn = 960
# Bounds of the sampled region along the x (real) axis.
const xmin = -2.0
const xmax = 0.6
# Bounds of the sampled region along the y (imaginary) axis.
const ymin = -1.5
const ymax = 1.5
# Upper bound on the number of iterations mandelbrot_kernel performs per point.
const MAX_ITERS = 200
"""
    mandelbrot_kernel(c, max_iters=MAX_ITERS)

Return the escape-time iteration count for the point `c`.

Starting from `z = c`, repeatedly apply `z = z * z + c` and return the 1-based index
of the first iteration after which `abs2(z) > 4`. If that never happens within
`max_iters` iterations, return `max_iters`.

# Arguments
- `c`: the point to test; any value supporting `*`, `+` and `abs2`.
- `max_iters`: iteration cap. Defaults to the file-level constant `MAX_ITERS`, so
  existing one-argument calls behave exactly as before.
"""
function mandelbrot_kernel(c, max_iters=MAX_ITERS)
    z = c
    for i in 1:max_iters
        z = z * z + c
        # Compare the squared magnitude against 4 (i.e. |z| > 2) to avoid a sqrt.
        if abs2(z) > 4
            return i
        end
    end
    return max_iters
end
"""
    compute_mandelbrot()

Evaluate `mandelbrot_kernel` over the grid defined by the file-level constants.

The x axis is sampled with `xn` points on `[xmin, xmax]` and the y axis with `yn`
points on `[ymin, ymax]`.

# Returns
A `yn × xn` `Matrix{Float64}` in which `result[j, i]` holds the iteration count for
the point `complex(x_range[i], y_range[j])`.

Rows (`j`) are distributed across threads with `Threads.@threads`; each iteration
writes only to its own row of `result`.
"""
function compute_mandelbrot()
    result = zeros(yn, xn)
    x_range = range(xmin, xmax; length=xn)
    # The y axis must have `yn` samples. It was previously built with `xn`, which
    # only worked because the grid is square: with yn > xn indexing `y_range[j]`
    # would go out of bounds, and with yn < xn the rows would cover only part of
    # [ymin, ymax].
    y_range = range(ymin, ymax; length=yn)
    Threads.@threads for j in 1:yn
        # `y` is constant along a row, so look it up once per row.
        y = y_range[j]
        for i in 1:xn
            result[j, i] = mandelbrot_kernel(complex(x_range[i], y))
        end
    end
    return result
end
# Axis coordinates for the plot: one value per column (x) and per row (y) of `result`.
x_range = range(xmin, xmax; length=xn)
y_range = range(ymin, ymax; length=yn)

# Compute the iteration counts and display them as a heatmap.
result = compute_mandelbrot()
heatmap(x_range, y_range, result)
I then benchmarked the Julia code:
julia> @btime compute_mandelbrot()
7.452 ms (341 allocations: 7.07 MiB)
For completeness, this is the Mojo code I used:
from benchmark import Benchmark
from complex import ComplexSIMD, ComplexFloat64
from math import iota
from python import Python
from runtime.llcl import num_cores, Runtime
from algorithm import parallelize, vectorize
from tensor import Tensor
from utils.index import Index
# Image dimensions in pixels (columns and rows of the result tensor).
alias width = 960
alias height = 960
# Maximum number of iterations the kernel loop runs per pixel.
alias MAX_ITERS = 200
# Bounds of the sampled region along x (real part) and y (imaginary part).
alias min_x = -2.0
alias max_x = 0.6
alias min_y = -1.5
alias max_y = 1.5
# Element type used for the result tensor and the SIMD vectors.
alias float_type = DType.float64
# Vector width used for float_type, as reported by simdwidthof.
alias simd_width = simdwidthof[float_type]()
def show_plot(tensor: Tensor[float_type]):
    """Display the iteration-count tensor as a shaded image via matplotlib.

    The tensor is copied element by element into a NumPy array of shape
    (height, width), shaded with a matplotlib LightSource, and shown with
    `plt.imshow`.

    Args:
        tensor: Iteration counts indexed as `tensor[row, col]`, with
            `height` rows and `width` columns.
    """
    alias scale = 10
    alias dpi = 64

    np = Python.import_module("numpy")
    plt = Python.import_module("matplotlib.pyplot")
    colors = Python.import_module("matplotlib.colors")

    numpy_array = np.zeros((height, width), np.float64)

    # Index as (row, col) so the indices match the (height, width) shape of
    # both the NumPy array and the tensor. The previous (col, row) indexing
    # was only valid because height == width; a non-square image would index
    # out of bounds.
    for row in range(height):
        for col in range(width):
            numpy_array.itemset((row, col), tensor[row, col])

    fig = plt.figure(1, [scale, scale * height // width], dpi)
    ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], False, 1)
    light = colors.LightSource(315, 10, 0, 1, 1, 0)

    image = light.shade(
        numpy_array, plt.cm.hot, colors.PowerNorm(0.3), "hsv", 0, 0, 1.5
    )
    plt.imshow(image)
    plt.axis("off")
    plt.show()
fn mandelbrot_kernel_SIMD[
    simd_width: Int
](c: ComplexSIMD[float_type, simd_width]) -> SIMD[float_type, simd_width]:
    """Compute per-lane Mandelbrot iteration counts for a vector of points.

    Starting from z = 0, each pass first tests which lanes still satisfy
    `squared_norm(z) <= 4`, adds one to the count of exactly those lanes,
    and then advances every lane with `z = z^2 + c`. The loop stops after
    MAX_ITERS passes, or earlier once no lane passed the test on the
    previous pass.

    Returns:
        A SIMD vector holding the iteration count of each lane.
    """
    var point = ComplexSIMD[float_type, simd_width](0, 0)
    var counts = SIMD[float_type, simd_width](0)
    var active: SIMD[DType.bool, simd_width] = True

    for step in range(MAX_ITERS):
        # Nothing left to do once every lane has escaped.
        if not active.reduce_or():
            break

        active = point.squared_norm() <= 4
        # Only lanes that are still inside the radius get their count bumped.
        counts = active.select(counts + 1, counts)
        point = point.squared_add(c)

    return counts
fn parallelized():
    """Benchmark the parallel, vectorized Mandelbrot computation and plot it.

    Fills a (height, width) tensor row by row, timing the whole fill with
    Benchmark, prints the elapsed time in milliseconds, and finally tries
    to display the result with show_plot.
    """
    let pixels = Tensor[float_type](height, width)

    @parameter
    fn worker(row: Int):
        # Distance between neighbouring pixels along each axis.
        let dx = (max_x - min_x) / width
        let dy = (max_y - min_y) / height

        @parameter
        fn compute_vector[simd_width: Int](col: Int):
            """Each call operates on a `simd_width` vector of pixels."""
            let re = min_x + (col + iota[float_type, simd_width]()) * dx
            let im = min_y + row * dy
            let points = ComplexSIMD[float_type, simd_width](re, im)
            # Row-major store: the pixel at (row, col) lives at row * width + col.
            pixels.data().simd_store[simd_width](
                row * width + col, mandelbrot_kernel_SIMD[simd_width](points)
            )

        # Walk the row in chunks of simd_width pixels.
        vectorize[simd_width, compute_vector](width)

    with Runtime() as rt:

        @parameter
        fn bench_parallel[simd_width: Int]():
            # One work item per row, split across 5 * num_cores() workers.
            parallelize[worker](rt, height, 5 * num_cores())

        alias simd_width = simdwidthof[DType.float64]()
        # The benchmark result is divided by 1e6 and reported as milliseconds.
        let elapsed_ms = Benchmark().run[bench_parallel[simd_width]]() / 1e6
        print("Parallelized:", elapsed_ms, "ms")

    # Plotting is best-effort: a failure is reported but does not abort the run.
    try:
        _ = show_plot(pixels)
    except e:
        print("failed to show plot:", e.value)
def main():
    """Program entry point: run the parallelized Mandelbrot benchmark."""
    parallelized()
This gave me the result:
Parallelized: 2.1398090000000001 ms
On my machine, the Mojo code runs in 2.14 ms. The Julia code takes 7.45 ms. I’m using 32 threads for Julia, but I admit that I haven’t fine-tuned that number. I was wondering if anyone would be willing to offer up ways to improve the Julia code further? I don’t really know what else I can do and I think this would be an excellent way to learn more about squeezing every last drop of performance out of Julia. If we could end up beating the Mojo code that’d be fantastic! Doubly so because of how much more elegant the Julia code is.
Thank you all for your time.