A pattern I often find useful is to assign to a slice of a matrix in a loop, forgoing the innermost loop, e.g.:
# Each iteration writes a freshly built 3-element vector into column `idat` of `array`.
for idat in 1:ndata
array[:, idat] = [xval, yval, zval]
end
The vector of values assigned to the slice could be the output of another function, or a vector constructed as above. I mostly use this because it is convenient. However, independent of how the vector to be assigned is generated, there are performance penalties that depend on how one chooses to assign the values to the matrix.
Consider the following cases. Here the `vals` vector is generated up front, which I hope isolates the differences between the methods:
"""
    assign_nofuse!(array::Matrix{T}) where {T}

Overwrite every column of `array` with the values `1, 2, …, size(array, 1)` converted
to `T`, using a plain (non-broadcast) slice assignment `array[:, j] = v` per column.

Mutates `array` in place and returns `nothing`.
"""
function assign_nofuse!(array::Matrix{T}) where {T}
    # Build the column once, outside the loop, so only the assignment style is exercised.
    column = map(k -> convert(T, k), axes(array, 1))
    for j in axes(array, 2)
        array[:, j] = column
    end
    return nothing
end
"""
    assign_fused!(array::Matrix{T}) where {T}

Overwrite every column of `array` with the values `1, 2, …, size(array, 1)` converted
to `T`, using broadcast assignment `array[:, j] .= v` per column.

Mutates `array` in place and returns `nothing`.
"""
function assign_fused!(array::Matrix{T}) where {T}
    # Build the column once, outside the loop, so only the assignment style is exercised.
    column = map(k -> convert(T, k), axes(array, 1))
    for j in axes(array, 2)
        array[:, j] .= column
    end
    return nothing
end
"""
    assign_doubleloop!(array::Matrix{T}) where {T}

Overwrite every column of `array` with the values `1, 2, …, size(array, 1)` converted
to `T`, writing one element at a time with explicit loops over columns and rows.

Mutates `array` in place and returns `nothing`.
"""
function assign_doubleloop!(array::Matrix{T}) where {T}
    # Build the column once, outside the loops, so only the assignment style is exercised.
    column = map(k -> convert(T, k), axes(array, 1))
    # Columns are the outer iteration and rows the inner one.
    for j in axes(array, 2), i in axes(array, 1)
        array[i, j] = column[i]
    end
    return nothing
end
and timing:
julia> arr = zeros(3, 100000)
julia> @benchmark assign_nofuse!(arr)
BenchmarkTools.Trial:
memory estimate: 3.04 MiB
allocs estimate: 199490
--------------
minimum time: 1.587 ms (0.00% GC)
median time: 1.749 ms (0.00% GC)
mean time: 1.998 ms (3.64% GC)
maximum time: 40.098 ms (0.00% GC)
--------------
samples: 2498
evals/sample: 1
julia> @benchmark assign_fused!(arr)
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 1
--------------
minimum time: 760.706 μs (0.00% GC)
median time: 842.758 μs (0.00% GC)
mean time: 991.368 μs (0.00% GC)
maximum time: 3.792 ms (0.00% GC)
--------------
samples: 5034
evals/sample: 1
julia> @benchmark assign_doubleloop!(arr)
BenchmarkTools.Trial:
memory estimate: 112 bytes
allocs estimate: 1
--------------
minimum time: 250.529 μs (0.00% GC)
median time: 258.918 μs (0.00% GC)
mean time: 311.633 μs (0.00% GC)
maximum time: 1.551 ms (0.00% GC)
--------------
samples: 10000
evals/sample: 1
Is there something I am missing? This is the first time I have profiled speed differences where I use this pattern, so chances are I’ve been unknowingly incurring a penalty for this style in other languages. Is there a better way to use this specific style? Or is it best to avoid it and assign the values to the slice of the matrix individually, as in `assign_doubleloop!`?
Thanks.