On Julia v0.5 and v0.6 we observe the following timings:
function add1!{T}(A::Matrix{T}, subA::Matrix{T}, I::UnitRange{Int}, J::UnitRange{Int})
size_subA = size(subA)
checkbounds(A,I.stop,J.stop)
checkbounds(subA,length(I),length(J))
k=1;l=1
@inbounds for j in J
l = 1
@simd for i in I
A[i,j] += subA[l,k]
l += 1
end
k +=1
end
return nothing
end
function add2!{T}(A::Matrix{T}, subA::Matrix{T}, I::UnitRange{Int}, J::UnitRange{Int})
A[I,J] += subA
return nothing
end
function add3!{T}(A::Matrix{T}, subA::Matrix{T}, I::UnitRange{Int}, J::UnitRange{Int})
A[I,J] .+= subA
return nothing
end
function test()
A = rand(10000,10000)
I = 1:100
J = 1:200
subA = rand(100,200)
ctr = 1
info("A[I,J]+=subA (loops over I,J)")
@time begin
for i = 1:10000
add1!(A,subA,I,J)
ctr += length(A)
end
end
info("A[I,J] += subA")
ctr = 1
@time begin
for i = 1:10000
add2!(A,subA,I,J)
ctr += length(A)
end
end
@time begin
ctr = 1
info("A[I,J] .+= subA")
for i = 1:10000
add3!(A,subA,I,J)
ctr += length(A)
end
end
end
test()
version 0.5.0:
julia> test()
INFO: A[I,J]+=subA (loops over I,J)
0.120754 seconds
INFO: A[I,J] += subA
2.149707 seconds (100.00 k allocations: 2.984 GB, 18.07% gc time)
INFO: A[I,J] .+= subA
1.985320 seconds (200.01 k allocations: 2.987 GB, 11.53% gc time)
version 0.6.0-pre.beta.217
julia> test()
INFO: A[I,J]+=subA (loops over I,J)
0.123705 seconds
INFO: A[I,J] += subA
2.085398 seconds (80.00 k allocations: 2.983 GiB, 21.45% gc time)
INFO: A[I,J] .+= subA
1.010136 seconds (40.04 k allocations: 1.491 GiB, 13.06% gc time)
Is there a reason why the range-indexing expression is so much slower and allocates so much more memory than the loop-based version? And why does the allocation double on 0.5 between .+=
and +=
, while it halves on 0.6?