Performance drop for explicit indexing in fused loops

Thanks, that helped a lot! For the simple operation from the opening post the performance is now the same, and for a stencil the fused, view-based version gets close to the devectorized loop, although it is still not quite as fast (~25% slower):

function test_vectorized(a, a_old)
    @views @. a[2:end-1, 2:end-1] = (
        (a_old[1:end-2, 2:end-1] + a_old[3:end, 2:end-1]) * 0.1
        + (a_old[2:end-1,1:end-2] + a_old[2:end-1, 3:end]) * 0.2)
end

function test_devectorized(a, a_old)
    nx, ny = size(a)
    @inbounds @simd for j = 2:(ny-1)
        for i = 2:(nx-1)
            a[i,j] = ((a_old[i-1, j] + a_old[i+1, j]) * 0.1 
                + (a_old[i, j-1] + a_old[i, j+1]) * 0.2)
        end
    end
end
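
(As an aside: the @simd docs describe putting it on the innermost loop. A variant with @simd moved inward would look like the sketch below; the function name is mine and I have not re-timed it, so I can't say whether it changes the numbers here.)

function test_devectorized_inner_simd(a, a_old)
    nx, ny = size(a)
    @inbounds for j = 2:(ny-1)
        # @simd on the innermost loop, which is the form the docs describe
        @simd for i = 2:(nx-1)
            a[i, j] = ((a_old[i-1, j] + a_old[i+1, j]) * 0.1
                + (a_old[i, j-1] + a_old[i, j+1]) * 0.2)
        end
    end
end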


a = rand(10000, 10000)

a1 = copy(a)
test_vectorized(a1, a)
@time test_vectorized(a1, a)

a2 = copy(a)
test_devectorized(a2, a)
@time test_devectorized(a2, a)

@assert isapprox(a1, a2)

This gives

> julia test.jl
  0.218883 seconds (87 allocations: 6.623 KiB)
  0.152084 seconds (4 allocations: 160 bytes)
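
(These numbers come from a single @time call after one warm-up run. For less noisy measurements one could also use the BenchmarkTools package, assuming it is installed; this only refines the measurement, the kernels stay the same.)

using BenchmarkTools

@btime test_vectorized($a1, $a)
@btime test_devectorized($a2, $a)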

(Without @views, test_vectorized() is about 5 times slower.)
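
For reference, here is my guess at what that non-@views timing corresponds to (the function name is mine, and I have not re-run the 5x comparison with exactly this code): without @views, each a_old[...] slice on the right-hand side materializes a full copy of the interior, so every call allocates four ~9998×9998 temporaries, on the order of 3 GB for the 10000×10000 case, which is where the extra time goes.

function test_vectorized_noviews(a, a_old)
    # Same kernel, but each slice of a_old below allocates a copy
    # instead of being a lazy view.
    @. a[2:end-1, 2:end-1] = (
        (a_old[1:end-2, 2:end-1] + a_old[3:end, 2:end-1]) * 0.1
        + (a_old[2:end-1, 1:end-2] + a_old[2:end-1, 3:end]) * 0.2)
end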