Thanks, that did help a lot! The performance is the same for the simple operation from the opening post, and the performance of a stencil is close to the devectorized version, although still not exactly the same (~25% slower):
function test_vectorized(a, a_old)
@views @. a[2:end-1, 2:end-1] = (
(a_old[1:end-2, 2:end-1] + a_old[3:end, 2:end-1]) * 0.1
+ (a_old[2:end-1,1:end-2] + a_old[2:end-1, 3:end]) * 0.2)
end
function test_devectorized(a, a_old)
nx, ny = size(a)
@inbounds @simd for j = 2:(ny-1)
for i = 2:(nx-1)
a[i,j] = ((a_old[i-1, j] + a_old[i+1, j]) * 0.1
+ (a_old[i, j-1] + a_old[i, j+1]) * 0.2)
end
end
end
a = rand(10000, 10000)
a1 = copy(a)
test_vectorized(a1, a)
@time test_vectorized(a1, a)
a2 = copy(a)
test_devectorized(a2, a)
@time test_devectorized(a2, a)
@assert isapprox(a1, a2)
This gives
> julia test.jl
0.218883 seconds (87 allocations: 6.623 KiB)
0.152084 seconds (4 allocations: 160 bytes)
(without the @views
, test_vectorized()
is about 5 times slower).