# What's a good and easy way to parallelize this?

Ultimately, I want to speed up `fullsys!`, and as you can see there are functions that fill arrays. I thought those filling operations could be parallelized. How can I get started with that?

``````const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
exponent = 0.5*(β*(1 - δ))
power = 1.0
p = one(promote_type(eltype(μ0m), eltype(μ0f)))
for k in 0:maxT(realage(am), realage(af))-1
p *= abs(μ0m[am + k, em] * μ0f[af + k, ef])^power
power *= exponent
end
return p
end

function insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
exponent = 0.5*(β*(1 - δ))
power = 1.0
p = one(promote_type(eltype(Sm), eltype(Sf)))
for k in 0:maxT(realage(am), realage(af))-1
p *= abs(Sm[am + k, em] * Sf[af + k, ef])^power
power *= exponent
end
return p
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
end
return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
for em in 1:szE, am in 1:szAge
s = zero(eltype(μ0m))
for ef in 1:szE, af in 1:szAge
s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
end
res[am, em] = Sm[am, em] - μ0m[am, em] - s
end
return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
for ef in 1:szE, af in 1:szAge
s = zero(eltype(μ0f))
for em in 1:szE, am in 1:szAge
s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
end
res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
end
return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)
``````

for reference, here’s the benchmark for this version:

``````julia> @benchmark fullsys!(\$resm, \$resf; auxmat=\$auxmat, Sm=\$Sm, Sf=\$Sf, μ0m=\$μ0m, μ0f=\$μ0f)
BenchmarkTools.Trial: 625 samples with 1 evaluation.
Range (min … max):  6.852 ms …  13.761 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     7.932 ms               ┊ GC (median):    0.00%
Time  (mean ± σ):   7.996 ms ± 515.543 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

▄▂▃▄█▁▃▅▅▂▅▃▂  ▁ ▂
▂▂▂▂▃▃▃▄▄▃▄▇▆▄▆█████████████████▇█▇▆▅▅▅▆▅▃▃▃▃▃▃▂▃▂▄▃▃▁▁▂▁▁▂ ▄
6.85 ms         Histogram: frequency by time        9.47 ms <

Memory estimate: 96 bytes, allocs estimate: 2.
``````

Before trying to speed your code up via paralellization, you should try to optimize it. For example, `insideproduct_num` can be calculated in the log domain, which would remove the floating point powers which would make your code roughly 20x faster.

2 Likes

Good idea, thank you!

``````julia> @benchmark insideproduct_den(30, 2, 25, 2; Sm=\$Sm, Sf=\$Sf)
BenchmarkTools.Trial: 10000 samples with 243 evaluations.
Range (min … max):  283.642 ns … 939.449 ns  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     322.564 ns               ┊ GC (median):    0.00%
Time  (mean ± σ):   328.856 ns ±  44.206 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

▄ ▆▇█▄▂ ▁        ▁                                        ▁
▇▂█▄█▆███████▇▇▇█▆▆█▇█▆▆▇▇▇▆▆█▆▅▆▆▆▆▆▅▆▆▆▅▆▆▆▆▅▆▅▅▅▅▆▄▅▃▄▅▄▅▅ █
284 ns        Histogram: log(frequency) by time        550 ns <

Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark insideproduct_den_log(30, 2, 25, 2; Sm=\$Sm, Sf=\$Sf)
BenchmarkTools.Trial: 10000 samples with 873 evaluations.
Range (min … max):  126.515 ns …  1.086 μs  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     143.638 ns              ┊ GC (median):    0.00%
Time  (mean ± σ):   147.881 ns ± 24.177 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

▁▂   ▄▆▇█▃▂▂▁▁▁     ▁         ▁                             ▂
▆██▁▇██████████████████▇█▇███▇▇█▇▇▇▇▇▇▇▆▇▇▆▇▆▆▆▄▆▄▅▆▅▄▅▅▆▅▅▅ █
127 ns        Histogram: log(frequency) by time       240 ns <

Memory estimate: 0 bytes, allocs estimate: 0.
``````

so now it’s

``````const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
exponent = 0.5*(β*(1 - δ))
power = 1.0
p = zero(promote_type(eltype(μ0m), eltype(μ0f)))
for k in 0:maxT(realage(am), realage(af))-1
p += power * log(abs(μ0m[am + k, em] * μ0f[af + k, ef]))
power *= exponent
end
return exp(p)
end

function insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
exponent = 0.5*(β*(1 - δ))
power = 1.0
p = zero(promote_type(eltype(Sm), eltype(Sf)))
for k in 0:maxT(realage(am), realage(af))-1
p += power * log(abs(Sm[am + k, em] * Sf[af + k, ef]))
power *= exponent
end
return exp(p)
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
end
return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
for em in 1:szE, am in 1:szAge
s = zero(eltype(μ0m))
for ef in 1:szE, af in 1:szAge
s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
end
res[am, em] = Sm[am, em] - μ0m[am, em] - s
end
return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
for ef in 1:szE, af in 1:szAge
s = zero(eltype(μ0f))
for em in 1:szE, am in 1:szAge
s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
end
res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
end
return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)
``````

Why is this a problem?

In other words: I don’t believe the overhead of parallelization is justified by this workload.

2 Likes