Threads are not speeding up the computation much

I made some changes to the code. New version:

module Keygen
using Base.Threads
using Base.Iterators
include("./secp256k1.jl")

"""
    gen_keys_range(k_range, out_matrix=nothing)

Derive the public-key `(x, y)` coordinates for every private key in `k_range`.
Writes row `i` for the `i`-th key into `out_matrix` when one is supplied
(it must have at least `length(k_range)` rows and 2 columns); otherwise a
fresh `length(k_range) × 2` `BigInt` matrix is allocated.

Returns `(keys, k_range)` so callers can map results back to their chunk.
"""
function gen_keys_range(k_range, out_matrix=nothing)
    if isnothing(out_matrix)
        # `undef` avoids allocating BigInt zeros that the loop below
        # immediately overwrites — every cell is assigned exactly once.
        keys = Matrix{BigInt}(undef, length(k_range), 2)
    else
        keys = out_matrix
    end
    for (i, k) in enumerate(k_range)
        x, y = Secp256k1.der_keys(k)
        keys[i, 1] = x
        keys[i, 2] = y
    end
    return keys, k_range
end

"""
    gen_keys_0(k_start, k_num)

Compute a `k_num × 2` matrix of public-key `(x, y)` coordinates for the
private keys `k_start:(k_start + k_num - 1)`, splitting the range across
threads. Each task writes directly into its own disjoint view of `keys`,
so no per-chunk result copies are needed.
"""
function gen_keys_0(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; the previous div() returned 0
    # whenever k_num < nthreads(), which makes partition() throw ArgumentError.
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = Task[]  # concretely typed; bare `[]` is Vector{Any}
    for k_r in partition(k_start:k_end, part_size)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        @views keys_view = keys[v_start:v_stop, :]
        push!(tasks, Threads.@spawn gen_keys_range(k_r, keys_view))
    end

    # wait (not fetch): results were written in place through the views.
    wait.(tasks)

    return keys
end

"""
    gen_keys_1(k_start, k_num)

Parallel variant in which each task allocates and returns its own chunk
matrix; the chunks are copied into the `k_num × 2` result afterwards.
Returns the assembled `BigInt` matrix.
"""
function gen_keys_1(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; div() yields 0 for
    # k_num < nthreads() and partition() rejects a zero chunk size.
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = Task[]  # concretely typed instead of Vector{Any}

    for k_r in partition(k_start:k_end, part_size)
        push!(tasks, Threads.@spawn gen_keys_range(k_r))
    end

    # Each task returns (chunk_matrix, chunk_range); place each chunk at
    # the row offset corresponding to its sub-range.
    for (chunk, k_r) in fetch.(tasks)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        keys[v_start:v_stop, :] = chunk
    end

    return keys
end

"""
    gen_keys_2(k_start, k_num)

Same strategy as [`gen_keys_1`](@ref) (tasks return per-chunk matrices that
are copied into the result), but the tasks are spawned via a comprehension.
Returns the assembled `k_num × 2` `BigInt` matrix.
"""
function gen_keys_2(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # Ceiling division keeps part_size >= 1; div() is 0 when k_num < nthreads().
    part_size = max(1, cld(k_num, Threads.nthreads()))
    tasks = [Threads.@spawn(gen_keys_range(k_r)) for k_r in partition(k_start:k_end, part_size)]
    for (chunk, k_r) in fetch.(tasks)
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        keys[v_start:v_stop, :] = chunk
    end

    return keys
end

# Benchmark driver: run each variant once to pay the compilation cost,
# then @time a second run so the measurement reflects steady-state work.
# (The former non-const global `nth` was only used for this one print;
# non-const globals are Any-typed and best avoided.)
println("Threads num $(Threads.nthreads())")


function test_gen_keys0()
    gen_keys_0(123, 100000)
end

test_gen_keys0()        # warm-up / compile
@time test_gen_keys0()

function test_gen_keys1()
    gen_keys_1(123, 100000)
end

test_gen_keys1()        # warm-up / compile
@time test_gen_keys1()

function test_gen_keys2()
    gen_keys_2(123, 100000)
end

test_gen_keys2()        # warm-up / compile
@time test_gen_keys2()

end

Results:

Threads num 1
 19.447613 seconds (337.12 M allocations: 8.414 GiB, 20.14% gc time)
 24.110456 seconds (337.12 M allocations: 8.416 GiB, 17.54% gc time)
 29.227764 seconds (337.12 M allocations: 8.416 GiB, 15.77% gc time)

Threads num 4
 12.691396 seconds (337.12 M allocations: 8.414 GiB, 36.10% gc time)
 11.250642 seconds (337.12 M allocations: 8.416 GiB, 40.23% gc time)
 13.027368 seconds (337.12 M allocations: 8.416 GiB, 37.02% gc time)

Threads num 16
  9.462348 seconds (337.12 M allocations: 8.414 GiB, 40.88% gc time)
  9.219033 seconds (337.12 M allocations: 8.416 GiB, 42.11% gc time)
  9.425437 seconds (337.12 M allocations: 8.416 GiB, 42.99% gc time)

Whatever is happening here, I still expect a near-linear speedup. Even if 42% of the time is spent in GC, with 16 threads the remaining parallel work should still give at least ~8x over the single-threaded run — instead I only see about 2x.

Is there a way I can tune GC for such workload?

1 Like