FYI, since `@threads` creates a closure and `@inbounds` does not penetrate closures, you need to put `@inbounds` inside `@threads` (as in `@inbounds x[i+1, j+1] = Int32(i*i + j*j)`).
The difference is probably very hard to observe for this kind of code, though. It’s easier to observe when a somewhat heavier, SIMD-able computation is inside the loop. For example:
julia> function f!(ys ,xs)
# Store sqrt_fast(xs[i]) into ys[i] for every i in eachindex(xs, ys), using a threaded loop.
# In this variant @inbounds is placed OUTSIDE Threads.@threads. Because @threads wraps the
# loop body in a closure and @inbounds does not penetrate closures, the annotation does not
# reach the indexing below. This is the slower of the two variants being compared.
@inbounds Threads.@threads for i in eachindex(xs, ys)
ys[i] = Base.FastMath.sqrt_fast(xs[i])
end
end;
julia> function g!(ys ,xs)
# Store sqrt_fast(xs[i]) into ys[i] for every i in eachindex(xs, ys), using a threaded loop.
# In this variant @inbounds is placed INSIDE the Threads.@threads loop body, directly on the
# indexing statement, so it is part of the closure that @threads creates and takes effect.
# This is the faster of the two variants being compared.
Threads.@threads for i in eachindex(xs, ys)
@inbounds ys[i] = Base.FastMath.sqrt_fast(xs[i])
end
end;
julia> xs = ones(2^25); ys = similar(xs);
julia> @btime f!(ys ,xs)
13.981 ms (41 allocations: 3.64 KiB)
julia> @btime g!(ys ,xs)
5.630 ms (41 allocations: 3.64 KiB)
julia> Threads.nthreads()
8