What's a good and easy way to parallelize this?

Ultimately, I want to speed up fullsys!, and as you can see there are functions that fill arrays. I thought those filling operations could be parallelized. How can I get started with that?

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = one(promote_type(eltype(μ0m), eltype(μ0f)))
    for k in 0:maxT(realage(am), realage(af))-1
        p *= abs(μ0m[am + k, em] * μ0f[af + k, ef])^power
        power *= exponent
    end
    return p
end

function insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = one(promote_type(eltype(Sm), eltype(Sf)))
    for k in 0:maxT(realage(am), realage(af))-1
        p *= abs(Sm[am + k, em] * Sf[af + k, ef])^power
        power *= exponent
    end
    return p
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
        auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
                insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
    end
    return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    for em in 1:szE, am in 1:szAge
        s = zero(eltype(μ0m))
        for ef in 1:szE, af in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[am, em] = Sm[am, em] - μ0m[am, em] - s
    end
    return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    for ef in 1:szE, af in 1:szAge
        s = zero(eltype(μ0f))
        for em in 1:szE, am in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
    end
    return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

    MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)

for reference, here’s the benchmark for this version:

julia> @benchmark fullsys!($resm, $resf; auxmat=$auxmat, Sm=$Sm, Sf=$Sf, μ0m=$μ0m, μ0f=$μ0f)
BenchmarkTools.Trial: 625 samples with 1 evaluation.
 Range (min … max):  6.852 ms …  13.761 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     7.932 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   7.996 ms ± 515.543 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                  ▄▂▃▄█▁▃▅▅▂▅▃▂  ▁ ▂                           
  ▂▂▂▂▃▃▃▄▄▃▄▇▆▄▆█████████████████▇█▇▆▅▅▅▆▅▃▃▃▃▃▃▂▃▂▄▃▃▁▁▂▁▁▂ ▄
  6.85 ms         Histogram: frequency by time        9.47 ms <

 Memory estimate: 96 bytes, allocs estimate: 2.

Before trying to speed your code up via paralellization, you should try to optimize it. For example, insideproduct_num can be calculated in the log domain, which would remove the floating point powers which would make your code roughly 20x faster.

2 Likes

Good idea, thank you!

julia> @benchmark insideproduct_den(30, 2, 25, 2; Sm=$Sm, Sf=$Sf)
BenchmarkTools.Trial: 10000 samples with 243 evaluations.
 Range (min … max):  283.642 ns … 939.449 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     322.564 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   328.856 ns ±  44.206 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▄ ▆▇█▄▂ ▁        ▁                                        ▁
  ▇▂█▄█▆███████▇▇▇█▆▆█▇█▆▆▇▇▇▆▆█▆▅▆▆▆▆▆▅▆▆▆▅▆▆▆▆▅▆▅▅▅▅▆▄▅▃▄▅▄▅▅ █
  284 ns        Histogram: log(frequency) by time        550 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark insideproduct_den_log(30, 2, 25, 2; Sm=$Sm, Sf=$Sf)
BenchmarkTools.Trial: 10000 samples with 873 evaluations.
 Range (min … max):  126.515 ns …  1.086 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     143.638 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   147.881 ns ± 24.177 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▁▂   ▄▆▇█▃▂▂▁▁▁     ▁         ▁                             ▂
  ▆██▁▇██████████████████▇█▇███▇▇█▇▇▇▇▇▇▇▆▇▇▆▇▆▆▆▄▆▄▅▆▅▄▅▅▆▅▅▅ █
  127 ns        Histogram: log(frequency) by time       240 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

so now it’s

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = zero(promote_type(eltype(μ0m), eltype(μ0f)))
    for k in 0:maxT(realage(am), realage(af))-1
        p += power * log(abs(μ0m[am + k, em] * μ0f[af + k, ef]))
        power *= exponent
    end
    return exp(p)
end

function insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = zero(promote_type(eltype(Sm), eltype(Sf)))
    for k in 0:maxT(realage(am), realage(af))-1
        p += power * log(abs(Sm[am + k, em] * Sf[af + k, ef]))
        power *= exponent
    end
    return exp(p)
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
        auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
                insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
    end
    return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    for em in 1:szE, am in 1:szAge
        s = zero(eltype(μ0m))
        for ef in 1:szE, af in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[am, em] = Sm[am, em] - μ0m[am, em] - s
    end
    return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    for ef in 1:szE, af in 1:szAge
        s = zero(eltype(μ0f))
        for em in 1:szE, am in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
    end
    return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

    MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)

Why is this a problem?

In other words: I don’t believe the overhead of parallelization is justified by this workload.

2 Likes