Fixing an embarrassing indexing bug, adding validation, incorporating the improvements from @LaurentPlagne and @mcabbott (big thanks!) and changing the bench marking range downwards
using LinearAlgebra, StaticArrays, LoopVectorization, Tullio, BenchmarkTools
function serial1!(ds, s, h)
for ci in 1:size(ds, 1)
ds[ci, 1] = s[ci, 2]*h[ci, 3] - s[ci, 3]*h[ci, 2]
ds[ci, 2] = s[ci, 3]*h[ci, 1] - s[ci, 1]*h[ci, 3]
ds[ci, 3] = s[ci, 1]*h[ci, 2] - s[ci, 2]*h[ci, 1]
end
end
function parallel1!(ds, s, h)
idx1 = (2, 3, 1)
idx2 = (3, 1, 2)
@tullio ds[ci, cj] = s[ci, idx1[cj]]*h[ci, idx2[cj]] - s[ci, idx2[cj]]*h[ci, idx1[cj]]
end
function serial2!(ds, s, h)
for cj in 1:size(ds, 2)
ds[1, cj] = s[2, cj]*h[3, cj] - s[3, cj]*h[2, cj]
ds[2, cj] = s[3, cj]*h[1, cj] - s[1, cj]*h[3, cj]
ds[3, cj] = s[1, cj]*h[2, cj] - s[2, cj]*h[1, cj]
end
end
function parallel2!(ds, s, h)
idx1 = (2, 3, 1)
idx2 = (3, 1, 2)
@tullio ds[ci, cj] = s[idx1[ci], cj]*h[idx2[ci], cj] - s[idx2[ci], cj]*h[idx1[ci], cj]
end
function serialSA!(ds, s, h)
@. ds = cross(s,h)
end
function parallelSA!(ds, s, h)
Threads.@threads for i β 1:length(s)
@inbounds ds[i] = cross(s[i],h[i])
end
end
s = rand(10, 3)
h = rand(10, 3)
ds11 = Matrix{Float64}(undef, 10, 3)
serial1!(ds11, s, h)
ds12 = Matrix{Float64}(undef, 10, 3)
parallel1!(ds12, s, h)
@assert ds12 β ds11
# s = rand(3, 10)
# h = rand(3, 10)
ds21 = Matrix{Float64}(undef, 3, 10)
serial2!(ds21, s', h')
@assert ds21 β ds12'
ds22 = Matrix{Float64}(undef, 3, 10)
parallel2!(ds22, s', h')
@assert ds22 β ds21
for n in [100, 10_000, 1_000_000]
println("n=$n")
for f β (serial1!,parallel1!)
print(string(f)," ")
@btime $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, $n, 3); s = rand($n, 3); h = rand($n, 3))
end
for f β (serial2!,parallel2!)
print(string(f)," ")
@btime $f(ds, s, h) setup=(ds = Matrix{Float64}(undef, 3, $n); s = rand(3, $n); h = rand(3, $n))
end
for f β (serialSA!,parallelSA!)
print(string(f))
@btime $f(ds, s, h) setup=(ds = rand(SVector{3, Float64},$n); s = rand(SVector{3, Float64},$n); h = rand(SVector{3, Float64},$n))
end
end
I see now
n=100
serial1! 279.452 ns (0 allocations: 0 bytes)
parallel1! 148.317 ns (0 allocations: 0 bytes)
serial2! 257.307 ns (0 allocations: 0 bytes)
parallel2! 481.026 ns (0 allocations: 0 bytes)
serialSA! 147.768 ns (0 allocations: 0 bytes)
parallelSA! 2.712 ΞΌs (31 allocations: 3.38 KiB)
n=10000
serial1! 27.200 ΞΌs (0 allocations: 0 bytes)
parallel1! 22.500 ΞΌs (0 allocations: 0 bytes)
serial2! 26.900 ΞΌs (0 allocations: 0 bytes)
parallel2! 54.600 ΞΌs (0 allocations: 0 bytes)
serialSA! 16.600 ΞΌs (0 allocations: 0 bytes)
parallelSA! 16.800 ΞΌs (30 allocations: 3.34 KiB)
n=1000000
serial1! 6.023 ms (0 allocations: 0 bytes)
parallel1! 3.329 ms (44 allocations: 3.03 KiB)
serial2! 6.669 ms (0 allocations: 0 bytes)
parallel2! 3.521 ms (44 allocations: 3.03 KiB)
serialSA! 4.624 ms (0 allocations: 0 bytes)
parallelSA! 3.197 ms (31 allocations: 3.38 KiB)
so the speedup is more like 2x for 6 cores on my machine.