I made some changes to the code. New version:
module Keygen
using Base.Threads
using Base.Iterators
include("./secp256k1.jl")
# Derive secp256k1 public keys for every scalar in `k_range`.
#
# Row `i` of the result holds the affine coordinates (x, y) of the key
# derived from the i-th scalar. When `out_matrix` is `nothing`, a fresh
# `length(k_range) × 2` BigInt matrix is allocated; otherwise results are
# written into `out_matrix` (which must have at least `length(k_range)`
# rows). Returns `(keys, k_range)` so callers can match rows to scalars.
function gen_keys_range(k_range, out_matrix=nothing)
    keys = isnothing(out_matrix) ? zeros(BigInt, length(k_range), 2) : out_matrix
    for (row, scalar) in enumerate(k_range)
        coords = Secp256k1.der_keys(scalar)
        keys[row, 1] = coords[1]
        keys[row, 2] = coords[2]
    end
    return keys, k_range
end
# Variant 0: spawned tasks write directly into views of one shared matrix.
#
# Derives public keys for the scalars `k_start : k_start + k_num - 1`,
# splitting the range into roughly one chunk per thread. Each task fills
# its own disjoint row-slice of `keys`, so no synchronization is needed
# beyond waiting for completion. Returns the `k_num × 2` BigInt matrix
# of (x, y) coordinates.
function gen_keys_0(k_start, k_num)
    keys = zeros(BigInt, k_num, 2)
    nth = Threads.nthreads()
    k_end = k_start + k_num - 1
    # cld (ceiling division) yields at most `nth` chunks; max(1, …) guards
    # the case k_num < nth, where div/cld could give 0 and
    # Iterators.partition throws ArgumentError on a zero chunk size.
    part_size = max(1, cld(k_num, nth))
    tasks = Task[]  # concrete eltype instead of Vector{Any}
    for k_r in partition(k_start:k_end, part_size)
        # Map the absolute scalar sub-range onto 1-based row indices.
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        @views keys_view = keys[v_start:v_stop, :]
        push!(tasks, Threads.@spawn gen_keys_range(k_r, keys_view))
    end
    wait.(tasks)
    return keys
end
# Variant 1: each task allocates its own result matrix; the parent copies
# all partial results back into `keys` after fetching.
#
# Same contract as `gen_keys_0`: derives keys for
# `k_start : k_start + k_num - 1` and returns a `k_num × 2` BigInt matrix.
function gen_keys_1(k_start, k_num)
    nth = Threads.nthreads()
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # cld caps the chunk count at `nth`; max(1, …) prevents a zero chunk
    # size (and the resulting partition ArgumentError) when k_num < nth.
    part_size = max(1, cld(k_num, nth))
    tasks = Task[]  # concrete eltype instead of Vector{Any}
    for k_r in partition(k_start:k_end, part_size)
        push!(tasks, Threads.@spawn gen_keys_range(k_r))
    end
    results = fetch.(tasks)
    for (k, k_r) in results
        # Translate the sub-range back to 1-based destination rows.
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        keys[v_start:v_stop, :] = k
    end
    return keys
end
# Variant 2: like `gen_keys_1` but spawns tasks via a comprehension and
# copies results back through a view (no temporary slice on the left side).
#
# Same contract as `gen_keys_0`: derives keys for
# `k_start : k_start + k_num - 1` and returns a `k_num × 2` BigInt matrix.
function gen_keys_2(k_start, k_num)
    nth = Threads.nthreads()
    keys = zeros(BigInt, k_num, 2)
    k_end = k_start + k_num - 1
    # cld caps the chunk count at `nth`; max(1, …) prevents a zero chunk
    # size (and the resulting partition ArgumentError) when k_num < nth.
    part_size = max(1, cld(k_num, nth))
    tasks = [Threads.@spawn(gen_keys_range(k_r)) for k_r in partition(k_start:k_end, part_size)]
    results = fetch.(tasks)
    for (k, k_r) in results
        # Translate the sub-range back to 1-based destination rows.
        v_start = first(k_r) - k_start + 1
        v_stop = last(k_r) - k_start + 1
        @views keys[v_start:v_stop, :] = k
    end
    return keys
end
# --- benchmark driver (runs when the module is loaded) ---
nth = Threads.nthreads()
println("Threads num $nth")

# One zero-argument wrapper per variant; each is called once so @time on
# the second call measures runtime rather than compilation.
test_gen_keys0() = gen_keys_0(123, 100000)
test_gen_keys0()
@time test_gen_keys0()

test_gen_keys1() = gen_keys_1(123, 100000)
test_gen_keys1()
@time test_gen_keys1()

test_gen_keys2() = gen_keys_2(123, 100000)
test_gen_keys2()
@time test_gen_keys2()
end
Results:
Threads num 1
19.447613 seconds (337.12 M allocations: 8.414 GiB, 20.14% gc time)
24.110456 seconds (337.12 M allocations: 8.416 GiB, 17.54% gc time)
29.227764 seconds (337.12 M allocations: 8.416 GiB, 15.77% gc time)
Threads num 4
12.691396 seconds (337.12 M allocations: 8.414 GiB, 36.10% gc time)
11.250642 seconds (337.12 M allocations: 8.416 GiB, 40.23% gc time)
13.027368 seconds (337.12 M allocations: 8.416 GiB, 37.02% gc time)
Threads num 16
9.462348 seconds (337.12 M allocations: 8.414 GiB, 40.88% gc time)
9.219033 seconds (337.12 M allocations: 8.416 GiB, 42.11% gc time)
9.425437 seconds (337.12 M allocations: 8.416 GiB, 42.99% gc time)
Whatever is happening here, I still expect a near-linear speedup. Even if 42% of the time is spent in GC, the remaining work should still scale to at least 8x on 16 threads.
Is there a way to tune the GC for this kind of workload?