Modified benchmark, now including `@tturbo` (threaded LoopVectorization) variants
using LinearAlgebra, StaticArrays, LoopVectorization, Tullio, BenchmarkTools
using DataFrames
function serial1!(ds, s, h)
    # Serial baseline, row-major data layout: row i holds one 3-vector in
    # columns 1:3. Computes ds[i, :] = s[i, :] × h[i, :] for every row.
    @inbounds @fastmath for i in axes(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function turbo1!(ds, s, h)
    # SIMD-vectorized (single-threaded) variant of serial1! using
    # LoopVectorization's @turbo; same row-major cross product.
    @turbo for i in 1:size(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function tturbo1!(ds, s, h)
    # Threaded + SIMD variant of serial1! using LoopVectorization's
    # @tturbo; same row-major cross product.
    @tturbo for i in 1:size(ds, 1)
        ds[i, 1] = s[i, 2] * h[i, 3] - s[i, 3] * h[i, 2]
        ds[i, 2] = s[i, 3] * h[i, 1] - s[i, 1] * h[i, 3]
        ds[i, 3] = s[i, 1] * h[i, 2] - s[i, 2] * h[i, 1]
    end
end
function parallel1!(ds, s, h)
    # Tullio (threaded) variant of the row-major cross product. The two
    # tuples encode the cyclic index permutation of the cross product:
    # component j uses components nxt[j] and prv[j] of the inputs.
    nxt = (2, 3, 1)
    prv = (3, 1, 2)
    @tullio ds[i, j] = s[i, nxt[j]] * h[i, prv[j]] - s[i, prv[j]] * h[i, nxt[j]]
end
function serial2!(ds, s, h)
    # Serial baseline, column-major data layout: column j holds one
    # 3-vector in rows 1:3. Computes ds[:, j] = s[:, j] × h[:, j].
    @inbounds @fastmath for j in axes(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function turbo2!(ds, s, h)
    # SIMD-vectorized (single-threaded) variant of serial2! using
    # LoopVectorization's @turbo; same column-major cross product.
    @turbo for j in 1:size(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function tturbo2!(ds, s, h)
    # Threaded + SIMD variant of serial2! using LoopVectorization's
    # @tturbo; same column-major cross product.
    @tturbo for j in 1:size(ds, 2)
        ds[1, j] = s[2, j] * h[3, j] - s[3, j] * h[2, j]
        ds[2, j] = s[3, j] * h[1, j] - s[1, j] * h[3, j]
        ds[3, j] = s[1, j] * h[2, j] - s[2, j] * h[1, j]
    end
end
function parallel2!(ds, s, h)
    # Tullio (threaded) variant of the column-major cross product;
    # nxt/prv encode the cyclic index permutation (see parallel-variant
    # of the row-major version).
    nxt = (2, 3, 1)
    prv = (3, 1, 2)
    @tullio ds[i, j] = s[nxt[i], j] * h[prv[i], j] - s[prv[i], j] * h[nxt[i], j]
end
function serialSA!(ds, s, h)
    # Serial element-wise cross product over arrays of 3-vectors
    # (e.g. Vector{SVector{3,Float64}}): ds[i] = s[i] × h[i].
    map!(cross, ds, s, h)
end
function parallelSA!(ds, s, h)
    # Multithreaded element-wise cross product over arrays of 3-vectors
    # (e.g. Vector{SVector{3,Float64}}): ds[i] = s[i] × h[i].
    # Fix: the `∈` in the loop header was mangled to `β` (invalid syntax);
    # also use eachindex, which makes @inbounds provably safe and checks
    # that all three arrays share the same axes.
    Threads.@threads for i in eachindex(ds, s, h)
        @inbounds ds[i] = cross(s[i], h[i])
    end
end
# --- Correctness checks: all implementations must agree on a small input.
# Fix: every `≈` (isapprox) in the asserts below was mangled to `β`.
s = rand(10, 3);
h = rand(10, 3);
ds11 = Matrix{Float64}(undef, 10, 3);
serial1!(ds11, s, h);
ds12 = Matrix{Float64}(undef, 10, 3);
parallel1!(ds12, s, h);
@assert ds12 ≈ ds11
ds12 .= NaN;  # poison the buffer so a no-op implementation fails the check
turbo1!(ds12, s, h);
@assert ds12 ≈ ds11
ds12 .= NaN;
tturbo1!(ds12, s, h);
@assert ds12 ≈ ds11
# Transposed (column-major 3×n) layout: same data, so results must match
# the row-major results transposed.
s = permutedims(s);
h = permutedims(h);
ds21 = Matrix{Float64}(undef, 3, 10);
serial2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds21 .= NaN;
turbo2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds21 .= NaN;
tturbo2!(ds21, s, h)
@assert ds21 ≈ ds12'
ds22 = Matrix{Float64}(undef, 3, 10);
parallel2!(ds22, s, h);
@assert ds22 ≈ ds21
# Throughput helpers: each field is an n×3 (or 3×n) array of Float64.
array_size_in_bytes(n) = 8 * 3 * n                       # bytes in one field
gbs(t, n) = 3 * array_size_in_bytes(n) / (1.0e9 * t)     # 2 reads + 1 write
gflops(t, n) = 9 * n / (1.0e9 * t)                       # 6 mul + 3 sub per element
"""
    bench()

Benchmark every cross-product implementation over problem sizes
`n = 10^2, 10^4, 10^6, 10^8`, printing per-call timings and returning a
`DataFrame` with columns `n`, `fn`, `t`, `GFlops`, `GBs`.
"""
function bench()
    # Fix: the `∈` in each `for f ...` header was mangled to `β`
    # (invalid syntax).
    df = DataFrame()
    for p in (2, 4, 6, 8)
        n = 10^p
        println("n=$n")
        # Row-major (n×3) implementations.
        for f in (serial1!, turbo1!, tturbo1!, parallel1!)
            t = @belapsed $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, $n, 3); s = rand($n, 3); h = rand($n, 3))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
        # Column-major (3×n) implementations.
        for f in (serial2!, turbo2!, tturbo2!, parallel2!)
            t = @belapsed $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, 3, $n); s = rand(3, $n); h = rand(3, $n))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
        # StaticArrays (Vector{SVector{3,Float64}}) implementations.
        for f in (serialSA!, parallelSA!)
            t = @belapsed $f(ds, s, h) setup=(ds = rand(SVector{3, Float64}, $n); s = rand(SVector{3, Float64}, $n); h = rand(SVector{3, Float64}, $n))
            append!(df, DataFrame(n = n, fn = string(f), t = t, GFlops = gflops(t, n), GBs = gbs(t, n)))
            println("$(string(f)) t=$t s (GB/s=$(gbs(t, n)))")
        end
    end
    return df
end
# Run the full benchmark sweep; returns the results DataFrame.
bench()
Unsurprisingly, the serial performance is similar between the M1 and M1 Max, but something seems wrong with my multithreaded performance.
Also somewhat surprising (to me) is just how well the StaticArray methods perform, particularly at larger sizes.