Threads are not speeding up the computation much

I made some changes to the code. New version:

module Keygen
using Base.Threads
using Base.Iterators
include("./secp256k1.jl")

"""
    gen_keys_range(k_range, out_matrix=nothing)

Derive the public-key `(x, y)` coordinates for every private key in `k_range`.
Writes row `i` for the `i`-th key into `out_matrix` when one is supplied
(it must have at least `length(k_range)` rows and 2 columns); otherwise a
fresh `length(k_range) × 2` `BigInt` matrix is allocated.

Returns `(keys, k_range)` so callers can map results back to their chunk.
"""
function gen_keys_range(k_range, out_matrix=nothing)
    if isnothing(out_matrix)
        # `undef` avoids allocating BigInt zeros that the loop below
        # immediately overwrites — every cell is assigned exactly once.
        keys = Matrix{BigInt}(undef, length(k_range), 2)
    else
        keys = out_matrix
    end
    for (i, k) in enumerate(k_range)
        x, y = Secp256k1.der_keys(k)
        keys[i, 1] = x
        keys[i, 2] = y
    end
    return keys, k_range
end

"""
    gen_keys_0(k_start, k_num)

Compute a `k_num × 2` matrix of public-key `(x, y)` coordinates for the
private keys `k_start:(k_start + k_num - 1)`, splitting the range across
threads. Each task writes directly into its own disjoint view of `keys`,
so no per-chunk result copies are needed.
"""
function gen_keys_0(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; the previous div() returned 0
    # whenever k_num < nthreads(), which makes partition() throw ArgumentError.
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = Task[]  # concretely typed; bare `[]` is Vector{Any}
    for k_r in partition(k_start:k_end, part_size)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        @views keys_view = keys[v_start:v_stop, :]
        push!(tasks, Threads.@spawn gen_keys_range(k_r, keys_view))
    end

    # wait (not fetch): results were written in place through the views.
    wait.(tasks)

    return keys
end

"""
    gen_keys_1(k_start, k_num)

Parallel variant in which each task allocates and returns its own chunk
matrix; the chunks are copied into the `k_num × 2` result afterwards.
Returns the assembled `BigInt` matrix.
"""
function gen_keys_1(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; div() yields 0 for
    # k_num < nthreads() and partition() rejects a zero chunk size.
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = Task[]  # concretely typed instead of Vector{Any}

    for k_r in partition(k_start:k_end, part_size)
        push!(tasks, Threads.@spawn gen_keys_range(k_r))
    end

    # Each task returns (chunk_matrix, chunk_range); place each chunk at
    # the row offset corresponding to its sub-range.
    for (chunk, k_r) in fetch.(tasks)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        keys[v_start:v_stop, :] = chunk
    end

    return keys
end

"""
    gen_keys_2(k_start, k_num)

Same strategy as [`gen_keys_1`](@ref) (tasks return per-chunk matrices that
are copied into the result), but the tasks are spawned via a comprehension.
Returns the assembled `k_num × 2` `BigInt` matrix.
"""
function gen_keys_2(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; div() is 0 when k_num < nthreads().
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = [Threads.@spawn(gen_keys_range(k_r)) for k_r in partition(k_start:k_end, part_size)]
    for (chunk, k_r) in fetch.(tasks)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        keys[v_start:v_stop, :] = chunk
    end

    return keys
end

# Benchmark driver: run each variant once to pay the compilation cost,
# then @time a second run so the measurement reflects steady-state work.
# (The former non-const global `nth` was only used for this one print;
# non-const globals are Any-typed and best avoided.)
println("Threads num $(Threads.nthreads())")


function test_gen_keys0()
    gen_keys_0(123, 100000)
end

test_gen_keys0()        # warm-up / compile
@time test_gen_keys0()

function test_gen_keys1()
    gen_keys_1(123, 100000)
end

test_gen_keys1()        # warm-up / compile
@time test_gen_keys1()

function test_gen_keys2()
    gen_keys_2(123, 100000)
end

test_gen_keys2()        # warm-up / compile
@time test_gen_keys2()

end

Results:

Threads num 1
 19.447613 seconds (337.12 M allocations: 8.414 GiB, 20.14% gc time)
 24.110456 seconds (337.12 M allocations: 8.416 GiB, 17.54% gc time)
 29.227764 seconds (337.12 M allocations: 8.416 GiB, 15.77% gc time)

Threads num 4
 12.691396 seconds (337.12 M allocations: 8.414 GiB, 36.10% gc time)
 11.250642 seconds (337.12 M allocations: 8.416 GiB, 40.23% gc time)
 13.027368 seconds (337.12 M allocations: 8.416 GiB, 37.02% gc time)

Threads num 16
  9.462348 seconds (337.12 M allocations: 8.414 GiB, 40.88% gc time)
  9.219033 seconds (337.12 M allocations: 8.416 GiB, 42.11% gc time)
  9.425437 seconds (337.12 M allocations: 8.416 GiB, 42.99% gc time)

Whatever is happening here, I still expect a near-linear speedup. Even if 42% of the time is spent in GC, with 16 threads the remaining parallel work should still give at least ~8x over the single-threaded run — instead I only see about 2x.

Is there a way I can tune GC for such workload?

1 Like