It’ll go just a little faster if you add `@inbounds`.
Or use broadcasting, which does that for you:
julia> b_axpy!(a,b,c) = (c .= a .* b .+ c; nothing)  # `.=` writes a .* b .+ c back into c in place; the trailing `nothing` is the return value
b_axpy! (generic function with 1 method)
julia> @btime b_axpy!(a,b,c);
92.255 ms (0 allocations: 0 bytes)
julia> function inbounds_axpy!(a,b,c)
# In-place axpy: overwrites each c[i] with a*b[i] + c[i] and returns nothing.
for i = 1:length(c)
# @inbounds skips the bounds checks on b[i] and c[i]. The loop range is taken from c alone,
# so this presumably relies on b having at least length(c) elements and on both arrays
# being 1-based; verify against callers.
@inbounds c[i] = a*b[i]+c[i]
end
return nothing
end
inbounds_axpy! (generic function with 1 method)
julia> @btime inbounds_axpy!(a,b,c);
92.229 ms (0 allocations: 0 bytes)
julia> @btime LinearAlgebra.BLAS.axpy!(a,b,c);
92.531 ms (0 allocations: 0 bytes)
julia> @btime custom_non_threaded_axpy!(a,b,c);
99.350 ms (0 allocations: 0 bytes)