Hi !
I am trying to learn how to use parallel and distributed computing on the CPU. I don't understand why it is slower than a sequential `for` loop… Since I really can't figure out why, I am asking for your help. Thank you!
using BenchmarkTools
using Test
ADD on CPU
# Problem size and operands: x holds ones, y holds twos; adding x into y in place
# must leave 3.0 in every slot of y.
N = 1 << 20
x = ones(Float64, N)
y = fill!(similar(x), 2.0);
broadcast!(+, y, y, x)
@test all(==(3.0), y)
Test Passed
"""
    sequential_add!(y, x)

Add `x` to `y` element by element, in place, using a plain single-threaded loop.

`y` is mutated; `x` is only read. Returns `nothing`.
"""
function sequential_add!(y, x)
    # eachindex(y, x) rejects arrays whose indices do not match, which is what
    # makes skipping the bounds checks below safe.
    for k in eachindex(y, x)
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end
"""
    parallel_add!(y, x)

Add `x` to `y` element by element, in place, with the loop iterations divided
among the available Julia threads by `Threads.@threads`.

Each iteration touches a distinct index, so the iterations are independent.
`y` is mutated; `x` is only read. Returns `nothing`.
"""
function parallel_add!(y, x)
    # Shared index range; eachindex(y, x) rejects mismatched arrays up front,
    # so the unchecked accesses in the loop stay in range.
    idxs = eachindex(y, x)
    Threads.@threads for k in idxs
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end
Threads.nthreads()
8
using Distributed
nprocs() == 1 && addprocs()
@everywhere using SharedArrays
@everywhere begin
using Test
using BenchmarkTools
end
# Destination vector backed by a SharedArray so that worker processes can write
# into the same storage; every element starts at 2.0.
y_shar = SharedArray{Float64}(N)
fill!(y_shar, 2.0)
"""
    distributed_add!(y, x)

Add `x` to `y` element by element, in place, with the index range split across
worker processes by `@distributed`; `@sync` blocks until every chunk has finished.

`y` and `x` must have matching indices, otherwise a `DimensionMismatch` is thrown
before any element is written. Returns `nothing`.

NOTE(review): for the writes to be visible on the calling process when real
workers exist, `y` presumably has to be storage shared between processes (the
script passes a `SharedArray`). A plain `Array` passed as `x` is captured by the
loop body and is likely shipped to each worker on every call, which would add
to the runtime of this version — confirm against the Distributed manual.
"""
function distributed_add!(y, x)
    # Validate on the caller: eachindex(y, x) throws when the two arrays
    # disagree, so the @inbounds accesses below can never leave `y` or `x`.
    idxs = eachindex(y, x)
    @sync @distributed for i in idxs
        @inbounds y[i] += x[i]
    end
    return nothing
end
distributed_add! (generic function with 1 method)
# Check the distributed kernel on the shared vector: starting from 2.0 and adding
# x must leave 3.0 in every slot.
fill!(y_shar, 2.0)
distributed_add!(y_shar, x)
@test all(==(3.0), y_shar)
Test Passed
# Check the sequential kernel: reset y to 2.0, add x, expect 3.0 everywhere.
y .= 2.0
sequential_add!(y, x)
@test all(==(3.0), y)
# Same check for the multithreaded kernel.
y .= 2.0
parallel_add!(y, x)
@test all(==(3.0), y)
Test Passed
"""
    add_cpu_bench!(y, x)

Add `x` to `y` in place with a single broadcast; serves as the baseline that
the hand-written loops are timed against. Returns `nothing`.
"""
function add_cpu_bench!(y, x)
    # Same operation as `y .+= x`, written as an explicit in-place broadcast.
    broadcast!(+, y, y, x)
    return nothing
end
add_cpu_bench! (generic function with 1 method)
# Baseline timing (broadcast). `$` interpolation keeps global-variable lookup
# out of the measurement.
y .= 2.0
@btime add_cpu_bench!($y, $x)
610.000 μs (0 allocations: 0 bytes)
# Timing of the single-threaded loop, arguments interpolated with `$`.
y .= 2.0
@btime sequential_add!($y, $x)
611.100 μs (0 allocations: 0 bytes)
# Timing of the multithreaded loop, arguments interpolated with `$`.
y .= 2.0
@btime parallel_add!($y, $x)
608.800 μs (40 allocations: 4.11 KiB)
# Timing of the multi-process loop. Uses @btime, like the other measurements in
# this file, so the result prints in the same one-line "time (allocations)" form.
y_shar .= 2.0
@btime distributed_add!($y_shar, $x)
28.836 ms (1288 allocations: 60.72 KiB)