I found an example whose performance is quite different in Julia 1.10 compared with other versions.
MWE:
# GC.enable_logging(true)
# Interpolate complex-valued samples `V`, given at the nodes `X`, at the query
# points `Xq`. The real and imaginary parts are interpolated separately by
# `_myinterp1` and then recombined element-wise into a complex array.
# `method` is forwarded unchanged to `_myinterp1`.
function myinterp1(X, V, Xq, method)
    re_part = _myinterp1(X, real(V), Xq, method)
    im_part = _myinterp1(X, imag(V), Xq, method)
    return complex.(re_part, im_part)
end
# Interpolate the samples `v` (first dimension indexed by the nodes `x`) at every
# query point in `xq`, using `linear_core_uniform!` for each point.
#
# Arguments:
#   x  - node positions; the spacing handed to the kernel is `x[2] - x[1]`, so at
#        least two nodes are required (the kernel treats this as a uniform-grid hint).
#   v  - sample values; dimension 1 runs along `x`, any further dimensions are kept.
#   xq - query points, of any shape.
#   md - accepted for interface compatibility; not used here.
#
# Returns a freshly allocated array of size `(size(xq)..., size(v)[2:end]...)` whose
# element type is the floating-point promotion of the element types of `x`, `v`, `xq`.
function _myinterp1(x, v, xq, md)
    # Bind the converted arrays to fresh names. Reassigning the arguments `x`/`v`
    # and then capturing them in the `@threads` closure would force them into a
    # `Core.Box`, making every access inside the threaded loop type-unstable.
    nodes = float(x)
    vals = float(v)
    trailing_axes = axes(vals)[2:end]
    T = float(Base.promote_eltype(nodes, vals, xq))
    out = Array{T}(undef, size(xq)..., size(vals)[2:end]...)
    first_node = nodes[1]
    spacing = nodes[2] - first_node
    # Each iteration writes only to the slice of `out` selected by its own index `i`.
    Threads.@threads for i in CartesianIndices(xq)
        linear_core_uniform!(
            @view(out[i.I..., trailing_axes...]), nodes, vals, xq[i], first_node, spacing
        )
    end
    return out
end
# Return the index `k` of the interval `[x[k], x[k + 1]]` to use when interpolating
# at `xq`.
#
# - `xq <= x1` selects the first interval (`1`).
# - `xq >= last(x)` selects the last interval (`length(x) - 1`).
# - Otherwise the index is guessed from `(xq - x1) / dx`, clamped to the valid
#   range, and moved by one interval if `xq` falls outside the guessed interval.
#
# `x1` and `dx` are the first node and the node spacing as supplied by the caller.
@noinline function find_index_uniform(x::AbstractVector, xq, x1, dx)
    n = length(x)
    xq <= x1 && return 1
    xq >= last(x) && return n - 1

    guess = clamp(ceil(Int, (xq - x1) / dx), 1, n - 1)
    upper = x[guess + 1]
    lower = x[guess]
    if xq < lower
        # Guess was one interval too far to the right.
        return max(guess - 1, 1)
    elseif xq > upper
        # Guess was one interval too far to the left.
        return min(guess + 1, n - 1)
    end
    return guess
end
# Fill `out` with the linear interpolation of `v` (along its first dimension) at the
# single query point `xq`. Always returns `nothing`.
#
# - If `xq` is NaN, every element of `out` is set to `xq`.
# - Otherwise the interval index comes from `find_index_uniform(x, xq, x1, dx)` and
#   the weight is `t = (xq - x[k]) / (x[k + 1] - x[k])`:
#     * `t == 0` copies the slice `v[k, ...]`,
#     * `t == 1` copies the slice `v[k + 1, ...]`,
#     * infinite `t` writes `(upper - lower) * t`,
#     * any other `t` writes `(1.0 - t) * lower + t * upper`.
#   In the last two cases, equal samples are written directly instead of computed.
#
# The loops run under `@inbounds`, so `v`'s trailing axes must cover those of `out`.
@noinline function linear_core_uniform!(out, x, v, xq, x1, dx)
    if isnan(xq)
        out .= xq
        return nothing
    end

    k = find_index_uniform(x, xq, x1, dx)
    node_hi = x[k + 1]
    node_lo = x[k]
    t = (xq - node_lo) / (node_hi - node_lo)

    if iszero(t)
        copyto!(out, selectdim(v, 1, k))
    elseif isone(t)
        copyto!(out, selectdim(v, 1, k + 1))
    elseif isinf(t)
        @inbounds @simd for I in CartesianIndices(out)
            upper = v[k + 1, I.I...]
            lower = v[k, I.I...]
            out[I] = ifelse(lower == upper, lower, (upper - lower) * t)
        end
    else
        @inbounds @simd for I in CartesianIndices(out)
            upper = v[k + 1, I.I...]
            lower = v[k, I.I...]
            out[I] = ifelse(lower == upper, lower, (1.0 - t) * lower + t * upper)
        end
    end
    return nothing
end
# Number of columns that `main` fills, i.e. how many times the interpolation is run.
N_sym = 1000000
# Output buffer: one 56-element complex column is written per iteration in `main`.
H_data = zeros(ComplexF64, 56, N_sym)
# Complex sample values passed to `myinterp1` as `V`; 8 entries, one per entry of
# `P_f_station`.
H_pilot = [ 1.0289450805197744 + 0.05655308087766442im
1.454797484385196 + 0.002110720785675794im
0.3395020495046162 - 0.3247504157005432im
1.4979795289030202 - 0.24327972612882864im
1.1853909781963035 + 0.09244412671200611im
1.0904548629665436 - 0.03819692645652406im
1.2068283469332997 - 0.29843165086554696im
0.6339387787478238 + 0.1187855333646726im]
# Sample positions passed to `myinterp1` as `X`: 1, 9, ..., 57 (uniform spacing of 8).
P_f_station = [
1
9
17
25
33
41
49
57
]
# Query positions passed to `myinterp1` as `Xq`: the 56 integers in 1:64 that are not
# in `P_f_station`.
data_station = [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64]
# Benchmark driver: for each of the `N_sym` columns of `H_data`, interpolate
# `H_pilot` from the nodes `P_f_station` onto `data_station` and store the result
# in that column. The whole loop is wrapped in `@timev`, which prints timing and
# allocation statistics. The node and query vectors are deliberately re-`collect`ed
# on every iteration, exactly as in the original measurement.
function main(N_sym, H_data, P_f_station, H_pilot, data_station)
    @timev for k in 1:N_sym
        nodes = collect(P_f_station)
        queries = collect(data_station)
        interpolated = myinterp1(nodes, H_pilot, queries, "linear")
        H_data[:, k] = vec(interpolated)
    end
    return nothing
end
# Run the benchmark once; `@timev` inside `main` prints the timing/allocation report.
main(N_sym, H_data, P_f_station, H_pilot, data_station)
performance:
> julia +1.9 -t4 test.jl
9.106428 seconds (60.31 M allocations: 7.750 GiB, 5.13% gc time, 6.90% compilation time)
elapsed time (ns): 9106428213
gc time (ns): 467476902
bytes allocated: 8321032773
pool allocs: 60307995
non-pool GC allocs: 62
free() calls: 26
minor collections: 177
full collections: 0
> julia +1.10 -t4 test.jl
17.689621 seconds (51.15 M allocations: 7.386 GiB, 10.41% gc time, 3.00% compilation time)
elapsed time (ns): 17689621244
gc time (ns): 1841949403
bytes allocated: 7930900704
pool allocs: 51153352
non-pool GC allocs: 47
minor collections: 17
full collections: 1
> julia +1.11 -t4 test.jl
9.363672 seconds (62.29 M allocations: 7.405 GiB, 5.93% gc time, 8.20% compilation time)
elapsed time (ns): 9.363671719e9
gc time (ns): 555457964
bytes allocated: 7951022160
pool allocs: 62285624
non-pool GC allocs: 7
malloc() calls: 144
free() calls: 988
minor collections: 140
full collections: 1
> julia +1.12 -t4 test.jl
9.406524 seconds (62.40 M allocations: 6.218 GiB, 5.08% gc time, 7.15% compilation time)
elapsed time (ns): 9.406523758e9
gc time (ns): 477554554
bytes allocated: 6676672168
pool allocs: 62402786
non-pool GC allocs: 2
malloc() calls: 179
free() calls: 1108
minor collections: 111
full collections: 1
I know it's bad code for threads because of the allocations, and it would perform better single-threaded.
The question is why it's so slow in v1.10. I notice the GC time is not that much worse.