Help optimizing IPFP algorithm

I’m trying to speed up this version of the IPFP algorithm, which alternates between solving 2 systems of nonlinear equations until convergence:

using NLsolve
using LinearAlgebra

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

const IDX_ae = CartesianIndices((1:szAge, 1:szE))

@views function insideproduct(am, em, af, ef; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf)
    prod(abs((μ0m[am + k, em] * μ0f[af + k, ef])/(Sm[am + k, em] * Sf[af + k, ef]))^(0.5*(β*(1 - δ))^k) for k in 0:maxT(realage(am), realage(af))-1)
end

@views function MMsysM!(res, μ0m; Π::F=UMM, Sm=Sm, Sf=Sf, μ0f=μ0f) where F
    for idx_ae in IDX_ae
        am, em = Tuple(idx_ae)
        res[am, em] = Sm[am, em] - μ0m[am, em] - 
            sum( Π(realage(am), em, realage(idx_fi[1]), idx_fi[2]) * sqrt(Sm[am, em]*Sf[idx_fi[1], idx_fi[2]]) * 
                insideproduct(am, em, idx_fi[1], idx_fi[2]; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf) for idx_fi in IDX_ae )
    end
    return res
end

@views function MMsysF!(res, μ0f; Π::F=UMM, Sm=Sm, Sf=Sf, μ0m=μ0m) where F
    for idx_ae in IDX_ae
        af, ef = Tuple(idx_ae)
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - 
            sum( Π(realage(idx_mi[1]), idx_mi[2], realage(af), ef) * sqrt(Sm[idx_mi[1], idx_mi[2]]*Sf[af, ef]) * 
                insideproduct(idx_mi[1], idx_mi[2], af, ef; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf) for idx_mi in IDX_ae )
    end
    return res
end

function ipfp(Π::F, Sm, Sf; initμ0=initμ0, tol=1e-8, method=:trust_region, iterations=1_000, autodiff=:forward) where F
    i = 0

    μ0f1 = Sf
    μ0m = initμ0
    d = Inf
    while d > tol && i <= iterations
        solM = NLsolve.nlsolve((res, μ0m) -> MMsysM!(res, μ0m; Π=Π, Sm=Sm, Sf=Sf, μ0f=μ0f1), initμ0; method=method, autodiff=autodiff)
        μ0m = solM.zero
        solF = NLsolve.nlsolve((res, μ0f) -> MMsysF!(res, μ0f; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m), initμ0; method=method, autodiff=autodiff)
        μ0f2 = solF.zero
        d = norm(μ0f1 - μ0f2, Inf)
        μ0f1 = μ0f2
        i += 1
    end
    return abs.(μ0m), abs.(μ0f1)
end

@views Π(am, em, af, ef; Πmat=Πmat) = Πmat[realageinv(am), em, realageinv(af), ef]

Πmat = 1000 .* rand(szAge, szE, szAge, szE)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

initμ0 = ones(size(Sf))

Right now it takes 5.6 seconds on my computer:

julia> @benchmark ipfp((am, em, af, ef; Πmat=$Πmat) -> Π(am, em, af, ef; Πmat=$Πmat), $Sm, $Sf; method=:trust_region, autodiff=:finite)
BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took 5.585 s (0.00% GC) to evaluate,
 with a memory estimate of 24.29 MiB, over 78419 allocations.

I believe almost all of the time is getting spent in ...)^(0.5*(β*(1 - δ))^k). ^ with floating point values is really expensive, so rewriting this line could pretty easily give a 10x speedup. Specifically, you can calculate 0.5*(β*(1 - δ)) outside the loop, and replace the ^k with repeated multiplication. I think there might be other simplifications here as well.

1 Like

I did that and rewrote the sums and prods as loops. This is how it looks now:

using NLsolve
using LinearAlgebra

const szAge = 64 - 19 + 1
const szE = 2
const β = 0.92
const δ = 0.10

maxT(am, af; Z=realage(szAge)) = Z - max(am, af) + 1

realage(x) = x + 18
realageinv(x) = x - 18

const IDX_ae = CartesianIndices((1:szAge, 1:szE))

@views function insideproduct(am, em, af, ef; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf)
    exponent = 0.5*(β*(1 - δ))
    p = one(promote_type(eltype(μ0m), eltype(μ0f)))
    for k in 0:maxT(realage(am), realage(af))-1
        p *= abs((μ0m[am + k, em] * μ0f[af + k, ef])/(Sm[am + k, em] * Sf[af + k, ef]))^(exponent^k)
    end
    return p
end

@views function MMsysM!(res, μ0m; Π::F=UMM, Sm=Sm, Sf=Sf, μ0f=μ0f) where F
    for idx_ae in IDX_ae
        am, em = Tuple(idx_ae)
        s = zero(eltype(μ0m))
        for idx_fi in IDX_ae
            s += Π(realage(am), em, realage(idx_fi[1]), idx_fi[2]) * sqrt(Sm[am, em]*Sf[idx_fi[1], idx_fi[2]]) * 
                insideproduct(am, em, idx_fi[1], idx_fi[2]; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf)
        end
        res[am, em] = Sm[am, em] - μ0m[am, em] - s
    end
    return res
end

@views function MMsysF!(res, μ0f; Π::F=UMM, Sm=Sm, Sf=Sf, μ0m=μ0m) where F
    for idx_ae in IDX_ae
        af, ef = Tuple(idx_ae)
        s = zero(eltype(μ0f))
        for idx_mi in IDX_ae
            s += Π(realage(idx_mi[1]), idx_mi[2], realage(af), ef) * sqrt(Sm[idx_mi[1], idx_mi[2]]*Sf[af, ef]) * 
                insideproduct(idx_mi[1], idx_mi[2], af, ef; μ0m=μ0m, μ0f=μ0f, Sm=Sm, Sf=Sf)
        end
        res[af, ef] = Sf[af, ef] - μ0f[af, ef] - s
    end
    return res
end

function ipfp(Π::F, Sm, Sf; initμ0=initμ0, tol=1e-8, method=:trust_region, iterations=1_000, autodiff=:forward) where F
    i = 0

    μ0f1 = Sf
    μ0m = initμ0
    d = Inf
    while d > tol && i <= iterations
        solM = NLsolve.nlsolve((res, μ0m) -> MMsysM!(res, μ0m; Π=Π, Sm=Sm, Sf=Sf, μ0f=μ0f1), initμ0; method=method, autodiff=autodiff)
        μ0m = solM.zero
        solF = NLsolve.nlsolve((res, μ0f) -> MMsysF!(res, μ0f; Π=Π, Sm=Sm, Sf=Sf, μ0m=μ0m), initμ0; method=method, autodiff=autodiff)
        μ0f2 = solF.zero
        d = norm(μ0f1 - μ0f2, Inf)
        μ0f1 = μ0f2
        i += 1
    end
    return abs.(μ0m), abs.(μ0f1)
end

@views Π(am, em, af, ef; Πmat=Πmat) = Πmat[realageinv(am), em, realageinv(af), ef]

Πmat = 1000 .* rand(szAge, szE, szAge, szE)

Sm = 1000 .* rand(szAge, szE)
Sf = 1000 .* rand(szAge, szE)

initμ0 = similar(Sf)
julia> @benchmark ipfp((am, em, af, ef; Πmat=$Πmat) -> Π(am, em, af, ef; Πmat=$Πmat), $Sm, $Sf; method=:trust_region, autodiff=:finite)
BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range (min … max):  2.662 s …    2.820 s  ┊ GC (min … max): 0.05% … 0.00%
 Time  (median):     2.741 s               ┊ GC (median):    0.02%
 Time  (mean ± σ):   2.741 s ± 111.889 ms  ┊ GC (mean ± σ):  0.02% ± 0.03%

  █                                                        █  
  █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  2.66 s         Histogram: frequency by time         2.82 s <

 Memory estimate: 24.29 MiB, allocs estimate: 78419.

What about

power = 1.0
for k in 0:maxT(realage(am), realage(af))-1
    p *= abs((μ0m[am + k, em] * μ0f[af + k, ef])/(Sm[am + k, em] * Sf[af + k, ef]))^power
    power *= exponent
end
1 Like

Oh I see what you meant now. Wow that really helps, thank you:

julia> @benchmark ipfp((am, em, af, ef; Πmat=$Πmat) -> Π(am, em, af, ef; Πmat=$Πmat), $Sm, $Sf; method=:trust_region, autodiff=:finite)
BenchmarkTools.Trial: 4 samples with 1 evaluation.
 Range (min … max):  1.272 s …  1.295 s  ┊ GC (min … max): 0.07% … 0.00%
 Time  (median):     1.283 s             ┊ GC (median):    0.03%
 Time  (mean ± σ):   1.283 s ± 9.710 ms  ┊ GC (mean ± σ):  0.03% ± 0.04%

  █                    █        █                        █  
  █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  1.27 s        Histogram: frequency by time         1.3 s <

 Memory estimate: 24.29 MiB, allocs estimate: 78419.
1 Like

I am a firm believer that a^b is a bad operation that should not exist. (and I should know, https://github.com/JuliaLang/julia/pull/42271). Also, that PR means that this code should be about 2x faster on 1.8 than it is on 1.6/1.7. a^b has awful conditioning, and as far as I know, no good way of computing without extensive use of extended precision.

1 Like

I’m surprised by how much improvement I get by rewriting the sums and prods into for loops. Do you have any insight about that?