Some alternatives:
serial:
julia> function loop_dot_serial(x, A, y)
s = 0.
for i = eachindex(y)
for k = eachindex(x)
s += x[k]*A[k,i]*y[i]
end
end
return s
end
loop_dot_serial (generic function with 1 method)
julia> @btime loop_dot_serial($x,$A,$y)
259.091 ms (0 allocations: 0 bytes)
98.64233873489854
manual:
julia> function loop_dot_manual(x, A, y; ntasks = Threads.nthreads())
s = zeros(ntasks)
Threads.@threads for it in 1:ntasks
for i in it:ntasks:length(y)
for k = eachindex(x)
s[it] += x[k]*A[k,i]*y[i]
end
end
end
return sum(s)
end
loop_dot_manual (generic function with 1 method)
julia> @btime loop_dot_manual($x,$A,$y)
105.539 ms (42 allocations: 4.02 KiB)
98.64233873543105
using Floops
julia> using FLoops
julia> function loop_dot(x, A, y)
@floop for i = eachindex(y)
for k = eachindex(x)
@reduce s += x[k]*A[k,i]*y[i]
end
end
return s
end
loop_dot (generic function with 1 method)
julia> @btime loop_dot($x,$A,$y)
91.441 ms (116 allocations: 5.89 KiB)
98.64233873490912
using LoopVectorization (applies for this simple operation):
julia> using LoopVectorization
julia> function loop_dot_turbo(x, A, y)
s = 0.
@tturbo for i = eachindex(y)
for k = eachindex(x)
s += x[k]*A[k,i]*y[i]
end
end
return s
end
loop_dot_turbo (generic function with 1 method)
julia> @btime loop_dot_turbo($x,$A,$y)
69.347 ms (0 allocations: 0 bytes)
98.64233873475337
using Tullio:
julia> using Tullio
julia> function loop_dot_tullio(x,A,y)
s = 0.
@tullio s += x[k]*A[k,i]*y[i]
return s
end
loop_dot_tullio (generic function with 1 method)
julia> @btime loop_dot_tullio($x,$A,$y)
72.149 ms (203 allocations: 10.30 KiB)
98.64233873475337