Multithreading performance in different Julia versions

I found an example whose performance is quite different in Julia 1.10 compared with other versions.

MWE:

# GC.enable_logging(true)
"""
    myinterp1(X, V, Xq, method)

Interpolate the complex-valued samples `V`, given at the nodes `X`, at the query
points `Xq`.

The real and imaginary parts of `V` are interpolated independently by
`_myinterp1` and recombined element-wise with `complex`. `method` is forwarded
unchanged to `_myinterp1`.
"""
function myinterp1(X, V, Xq, method)
    re_part = _myinterp1(X, real(V), Xq, method)
    im_part = _myinterp1(X, imag(V), Xq, method)
    return complex.(re_part, im_part)
end

"""
    _myinterp1(x, v, xq, md)

Linearly interpolate the samples `v`, given at the nodes `x`, at every query
point in `xq`, and return the result as a new array.

`x` and `v` are converted to floating point first. The result has size
`(size(xq)..., size(v)[2:end]...)`: the first dimension of `v` (the node axis)
is replaced by the dimensions of `xq`. The grid spacing handed to the kernel is
`x[2] - x[1]`, so `x` must contain at least two nodes. Query points are
processed in parallel with `Threads.@threads`; each iteration writes to its own
slice of the output.

`md` is accepted for interface compatibility and is not used.
"""
function _myinterp1(x, v, xq, md)
    # Bind the converted arrays to fresh names instead of reassigning the
    # arguments: a variable that is assigned more than once and then captured
    # by the `@threads` closure is stored in a `Core.Box`, which makes this
    # whole function type-unstable and allocates on every access.
    xf = float(x)
    vf = float(v)
    sxq = size(xq)
    sv2end = size(vf)[2:end]
    out = Array{float(Base.promote_eltype(xf, vf, xq))}(undef, sxq..., sv2end...)
    last_axes = axes(vf)[2:end]
    x1 = xf[1]
    x2 = xf[2]
    dx = x2 - x1

    Threads.@threads for i in CartesianIndices(xq)
        linear_core_uniform!(@view(out[i.I..., last_axes...]), xf, vf, xq[i], x1, dx)
    end
    return out
end


"""
    find_index_uniform(x::AbstractVector, xq, x1, dx)

Return the index `k` of the interval `[x[k], x[k + 1]]` used to interpolate at
`xq`.

`x1` is the first node and `dx` the nominal node spacing. Queries at or below
`x1` map to the first interval and queries at or above `last(x)` map to the
last one. Otherwise the index is estimated from `(xq - x1) / dx`, clamped to
`1:length(x) - 1`, and moved by at most one interval if `xq` falls outside the
estimated one.
"""
@noinline function find_index_uniform(x::AbstractVector, xq, x1, dx)
    n = length(x)
    # Out-of-range queries snap to the first / last interval.
    xq <= x1 && return 1
    xq >= last(x) && return n - 1

    # Estimate the interval from the nominal spacing.
    guess = clamp(ceil(Int, (xq - x1) / dx), 1, n - 1)
    hi = x[guess + 1]
    lo = x[guess]
    # Correct the estimate by one step if the query is outside that interval.
    if xq < lo
        return max(guess - 1, 1)
    elseif xq > hi
        return min(guess + 1, n - 1)
    end
    return guess
end


"""
    linear_core_uniform!(out, x, v, xq, x1, dx)

Write the linear interpolation of `v` at the single query point `xq` into `out`
and return `nothing`.

`x` holds the nodes along the first dimension of `v`; `x1` and `dx` are passed
on to `find_index_uniform` to locate the interval. With `r` the relative
position of `xq` inside that interval:

- if `xq` is NaN, `out` is filled with `xq`;
- if `r` is zero or one, the matching slice of `v` is copied into `out`;
- if `r` is infinite, each element is `(y2 - y1) * r`, except that equal
  endpoints give `y1`;
- otherwise each element is `(1.0 - r) * y1 + r * y2`, except that equal
  endpoints give `y1`.
"""
@noinline function linear_core_uniform!(out, x, v, xq, x1, dx)
    # A NaN query propagates to the whole output slice.
    if isnan(xq)
        out .= xq
        return nothing
    end

    k = find_index_uniform(x, xq, x1, dx)
    hi = x[k + 1]
    lo = x[k]
    t = (xq - lo) / (hi - lo)

    if iszero(t)
        # Query sits exactly on the lower node.
        copyto!(out, selectdim(v, 1, k))
    elseif isone(t)
        # Query sits exactly on the upper node.
        copyto!(out, selectdim(v, 1, k + 1))
    elseif isinf(t)
        @inbounds @simd for idx in CartesianIndices(out)
            upper = v[k + 1, idx.I...]
            lower = v[k, idx.I...]
            out[idx] = ifelse(lower == upper, lower, (upper - lower) * t)
        end
    else
        @inbounds @simd for idx in CartesianIndices(out)
            upper = v[k + 1, idx.I...]
            lower = v[k, idx.I...]
            out[idx] = ifelse(lower == upper, lower, (1.0 - t) * lower + t * upper)
        end
    end
    return nothing
end

# Number of columns that `main` fills, i.e. how many times the interpolation runs.
N_sym = 1000000

# Output buffer: 56 rows (one per entry of `data_station`) by `N_sym` columns.
H_data = zeros(ComplexF64, 56, N_sym)

# Complex sample values, one per node in `P_f_station` (passed as `V` to `myinterp1`).
H_pilot = [ 1.0289450805197744 + 0.05655308087766442im
1.454797484385196 + 0.002110720785675794im
0.3395020495046162 - 0.3247504157005432im
1.4979795289030202 - 0.24327972612882864im
1.1853909781963035 + 0.09244412671200611im
1.0904548629665436 - 0.03819692645652406im
1.2068283469332997 - 0.29843165086554696im
0.6339387787478238 + 0.1187855333646726im]
# Node positions of the samples: 1, 9, ..., 57 (constant spacing of 8).
P_f_station = [
    1
    9
   17
   25
   33
   41
   49
   57
]

# Query positions: the 56 integers in 2:64 that are not nodes.
# 58:64 lie past the last node (57), so they use the last interval.
data_station = [2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64]


"""
    main(N_sym, H_data, P_f_station, H_pilot, data_station)

Benchmark driver: for each column `1:N_sym`, interpolate `H_pilot` from the
nodes `P_f_station` onto `data_station` and store the result in that column of
`H_data`. The whole loop is timed with `@timev`, which prints the statistics.
"""
function main(N_sym, H_data, P_f_station, H_pilot, data_station)
    @timev for col in 1:N_sym
        nodes = collect(P_f_station)
        queries = collect(data_station)
        interpolated = myinterp1(nodes, H_pilot, queries, "linear")
        H_data[:, col] = vec(interpolated)
    end
end

# Run the benchmark once; `main` wraps its loop in `@timev`, which prints the statistics.
main(N_sym, H_data, P_f_station, H_pilot, data_station)

performance:

> julia +1.9 -t4 test.jl
  9.106428 seconds (60.31 M allocations: 7.750 GiB, 5.13% gc time, 6.90% compilation time)
elapsed time (ns):  9106428213
gc time (ns):       467476902
bytes allocated:    8321032773
pool allocs:        60307995
non-pool GC allocs: 62
free() calls:       26
minor collections:  177
full collections:   0

> julia +1.10 -t4 test.jl
 17.689621 seconds (51.15 M allocations: 7.386 GiB, 10.41% gc time, 3.00% compilation time)
elapsed time (ns):  17689621244
gc time (ns):       1841949403
bytes allocated:    7930900704
pool allocs:        51153352
non-pool GC allocs: 47
minor collections:  17
full collections:   1

> julia +1.11 -t4 test.jl
  9.363672 seconds (62.29 M allocations: 7.405 GiB, 5.93% gc time, 8.20% compilation time)
elapsed time (ns):  9.363671719e9
gc time (ns):       555457964
bytes allocated:    7951022160
pool allocs:        62285624
non-pool GC allocs: 7
malloc() calls:     144
free() calls:       988
minor collections:  140
full collections:   1

> julia +1.12 -t4 test.jl
  9.406524 seconds (62.40 M allocations: 6.218 GiB, 5.08% gc time, 7.15% compilation time)
elapsed time (ns):  9.406523758e9
gc time (ns):       477554554
bytes allocated:    6676672168
pool allocs:        62402786
non-pool GC allocs: 2
malloc() calls:     179
free() calls:       1108
minor collections:  111
full collections:   1

I know it's bad code for threading because of the allocations, and it would perform better single-threaded.

The question is why it is so slow in v1.10. I notice that the GC time is not that much worse.