and, because I instigated this, here it is all put it together on an i7-7700k:
1.111 ms (79491 allocations: 2.82 MiB)
83.008 μs (103 allocations: 322.83 KiB)
14.454 μs (2 allocations: 234.45 KiB) . <-- fastest is steven's simple loop with direct assignment
28.244 ms (215534 allocations: 81.67 MiB)
27.926 ms (215534 allocations: 81.67 MiB)
157.606 μs (87 allocations: 772.67 KiB)
24.193 ms (163685 allocations: 80.42 MiB)
for the code below. interestingly, when one replaces the matrix entries with
m[1,i]=... with m[:,i] .= [x,y,f(x,y)]
1.093 ms (79491 allocations: 2.82 MiB)
then the performance drops by two orders of magnitude, to about 1.1ms. apparently, this makes all the difference in performance. looking at the memory allocations, this is what takes the extra time here. .=
does not avoid it.
f(x,y) = sqrt( x^2 + y^2 );
xset= 1:100; yset= 1:100; i= 1;
function loop1( xset, yset )
i= 1
m= Matrix{Float64}( length(xset)*length(yset), 3)
for x in xset
for y in yset
m[i,:]= [ x, y, f(x,y) ]
i+=1
end#for x
end#for##
m
end;#loop1##
function loop2( xset, yset )
i= 1
m= Matrix{Float64}( 3, length(xset)*length(yset))
@inbounds for x in Float64.(xset)
@simd for y in Float64.(yset)
m[:,i] .= ( x, y, f(x,y) ) ## fails
i += 1
end#for
end#for
end#function
function loop3( xset, yset )
m= Matrix{Float64}( 3, length(xset)*length(yset))
i = 1
@inbounds for x in xset, y in yset
m[1,i] = x
m[2,i] = y
m[3,i] = f(x,y)
i += 1
end#for
end#function
using Iterators;
rv( xset, yset) = reduce(vcat, [[x, y, f(x, y)]' for (y, x) in Iterators.product(yset, xset)])
ra( xset, yset) = ([ [z(x,y) for x in xset for y in yset] for z in ((x,y)->x, (x,y)->y, f) ]...)
using StaticArrays
sa( xset, yset) = (vv= [ @SVector [x, y, f(x,y)] for x in xset for y in yset ]; reinterpret(Float64, vv, (3, length(vv) )) )
rh( xset, yset) = reduce(hcat, [[x, y, f(x, y)] for (y, x) in Iterators.product(yset, xset)]) ## actually, this produces 10000x3
using BenchmarkTools
@btime z1= loop1( xset, yset );
@btime z2= loop2( xset, yset );
@btime z3= loop3( xset, yset );
@btime z4= rv( xset, yset );
@btime z5= rv( xset, yset );
@btime z6= ra( xset, yset );
@btime z7= rh( xset, yset );