What's a good and easy way to parallelize this?

amrods · January 20, 2022, 12:55am

Ultimately, I want to speed up fullsys!, and as you can see there are functions that fill arrays. I thought those filling operations could be parallelized. How can I get started with that?

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = one(promote_type(eltype(μ0m), eltype(μ0f)))
    for k in 0:maxT(realage(am), realage(af))-1
        p *= abs(μ0m[am + k, em] * μ0f[af + k, ef])^power
        power *= exponent
    end
    return p
end

function insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = one(promote_type(eltype(Sm), eltype(Sf)))
    for k in 0:maxT(realage(am), realage(af))-1
        p *= abs(Sm[am + k, em] * Sf[af + k, ef])^power
        power *= exponent
    end
    return p
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
        auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
                insideproduct_den(am, em, af, ef; Sm=Sm, Sf=Sf)
    end
    return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    for em in 1:szE, am in 1:szAge
        s = zero(eltype(μ0m))
        for ef in 1:szE, af in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[am, em] = Sm[am, em] - μ0m[am, em] - s
    end
    return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    for ef in 1:szE, af in 1:szAge
        s = zero(eltype(μ0f))
        for em in 1:szE, am in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
    end
    return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

    MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)

for reference, here’s the benchmark for this version:

julia> @benchmark fullsys!($resm, $resf; auxmat=$auxmat, Sm=$Sm, Sf=$Sf, μ0m=$μ0m, μ0f=$μ0f)
BenchmarkTools.Trial: 625 samples with 1 evaluation.
 Range (min … max):  6.852 ms …  13.761 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     7.932 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   7.996 ms ± 515.543 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                  ▄▂▃▄█▁▃▅▅▂▅▃▂  ▁ ▂                           
  ▂▂▂▂▃▃▃▄▄▃▄▇▆▄▆█████████████████▇█▇▆▅▅▅▆▅▃▃▃▃▃▃▂▃▂▄▃▃▁▁▂▁▁▂ ▄
  6.85 ms         Histogram: frequency by time        9.47 ms <

 Memory estimate: 96 bytes, allocs estimate: 2.

Oscar_Smith · January 20, 2022, 1:05am

Before trying to speed your code up via paralellization, you should try to optimize it. For example, insideproduct_num can be calculated in the log domain, which would remove the floating point powers which would make your code roughly 20x faster.

amrods · January 20, 2022, 1:25am

Good idea, thank you!

julia> @benchmark insideproduct_den(30, 2, 25, 2; Sm=$Sm, Sf=$Sf)
BenchmarkTools.Trial: 10000 samples with 243 evaluations.
 Range (min … max):  283.642 ns … 939.449 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     322.564 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   328.856 ns ±  44.206 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▄ ▆▇█▄▂ ▁        ▁                                        ▁
  ▇▂█▄█▆███████▇▇▇█▆▆█▇█▆▆▇▇▇▆▆█▆▅▆▆▆▆▆▅▆▆▆▅▆▆▆▆▅▆▅▅▅▅▆▄▅▃▄▅▄▅▅ █
  284 ns        Histogram: log(frequency) by time        550 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark insideproduct_den_log(30, 2, 25, 2; Sm=$Sm, Sf=$Sf)
BenchmarkTools.Trial: 10000 samples with 873 evaluations.
 Range (min … max):  126.515 ns …  1.086 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     143.638 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   147.881 ns ± 24.177 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▁▂   ▄▆▇█▃▂▂▁▁▁     ▁         ▁                             ▂
  ▆██▁▇██████████████████▇█▇███▇▇█▇▇▇▇▇▇▇▆▇▇▆▇▆▆▆▄▆▄▅▆▅▄▅▅▆▅▅▅ █
  127 ns        Histogram: log(frequency) by time       240 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

so now it’s

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

function insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = zero(promote_type(eltype(μ0m), eltype(μ0f)))
    for k in 0:maxT(realage(am), realage(af))-1
        p += power * log(abs(μ0m[am + k, em] * μ0f[af + k, ef]))
        power *= exponent
    end
    return exp(p)
end

function insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
    exponent = 0.5*(β*(1 - δ))
    power = 1.0
    p = zero(promote_type(eltype(Sm), eltype(Sf)))
    for k in 0:maxT(realage(am), realage(af))-1
        p += power * log(abs(Sm[am + k, em] * Sf[af + k, ef]))
        power *= exponent
    end
    return exp(p)
end

function fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    for ef in 1:szE, af in 1:szAge, em in 1:szE, am in 1:szAge
        auxmat[am, em, af, ef] = Π[am, em, af, ef] * sqrt(Sm[am, em]*Sf[af, ef]) /
                insideproduct_den_log(am, em, af, ef; Sm=Sm, Sf=Sf)
    end
    return auxmat
end

function MMsysM!(res, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    for em in 1:szE, am in 1:szAge
        s = zero(eltype(μ0m))
        for ef in 1:szE, af in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[am, em] = Sm[am, em] - μ0m[am, em] - s
    end
    return res
end

function MMsysF!(res, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    for ef in 1:szE, af in 1:szAge
        s = zero(eltype(μ0f))
        for em in 1:szE, am in 1:szAge
            s += auxmat[am, em, af, ef] * insideproduct_num_log(am, em, af, ef; μ0m=μ0m, μ0f=μ0f)
        end
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
    end
    return res
end

function fullsys!(resm, resf; auxmat=auxmat, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)
    fillauxmat!(auxmat; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m, μ0f=μ0f)

    MMsysM!(resm, μ0m; auxmat=auxmat, Sm=Sm, μ0f=μ0f)
    MMsysF!(resf, μ0f; auxmat=auxmat, Sf=Sf, μ0m=μ0m)
    return resm, resf
end

Π = 1000 .* rand(szAge, szE, szAge, szE)
auxmat = similar(Π)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

μ0m = rand(szAge, szE)
μ0f = rand(szAge, szE)

resm = similar(μ0m)
resf = similar(μ0f)

goerch · January 20, 2022, 5:35am

Why is this a problem?

In other words: I don’t believe the overhead of parallelization is justified by this workload.

Topic		Replies	Views
Sum operations between arrays Performance	21	5628	April 7, 2020
Parallel implementation is not very effective General Usage question , performance , parallel , multithreading	19	790	October 8, 2022
Improve performance of matrix computation Performance	10	1146	April 25, 2018
Speeding up array multiplication General Usage multithreading , high-performance	7	420	March 9, 2025
How to improve performance in a function that repeatedly defines and multiplies matrices Julia at Scale performance , parallel , linearalgebra , complex-numbers , matrix	64	1658	January 6, 2024

What's a good and easy way to parallelize this?

Related topics