Advanced tricks to reduce memory allocations of ODE

Hello,

I’m trying to further reduce memory allocations when solving an ODE using DifferentialEquations.jl. I have already implemented most of the tricks I’m aware of, but I still get a ton of memory allocation when calling min..

Here’s the du function I’m trying to optimize:

function SIA2D!(dH::Matrix{F}, H::Matrix{F}, simulation::SIM, t::F) where {F <: AbstractFloat, SIM <: Simulation}
    
    # Retrieve parameters
    @timeit get_timer("ODINN") "Variable assignment" begin
    SIA2D_model::SIA2Dmodel = simulation.model.iceflow
    glacier::Glacier = simulation.glaciers[simulation.model.iceflow.glacier_idx[]]
    params::Parameters = simulation.parameters
    int_type = simulation.parameters.simulation.int_type
    A::Ref{F} = SIA2D_model.A
    B::Matrix{F} = glacier.B
    S::Matrix{F} = SIA2D_model.S
    dSdx::Matrix{F} = SIA2D_model.dSdx
    dSdy::Matrix{F} = SIA2D_model.dSdy
    D::Matrix{F} = SIA2D_model.D
    dSdx_edges::Matrix{F} = SIA2D_model.dSdx_edges
    dSdy_edges::Matrix{F} = SIA2D_model.dSdy_edges
    ∇S::Matrix{F} = SIA2D_model.∇S
    Fx::Matrix{F} = SIA2D_model.Fx
    Fy::Matrix{F} = SIA2D_model.Fy
    Δx::F = glacier.Δx
    Δy::F = glacier.Δy
    Γ::Ref{F} = SIA2D_model.Γ
    n::int_type = simulation.parameters.physical.n
    ρ::F = simulation.parameters.physical.ρ
    g::F = simulation.parameters.physical.g
    end

    # First, enforce values to be positive
    @timeit get_timer("ODINN") "H to zero" begin
    H[H.<0.0] .= 0.0
    # Update glacier surface altimetry
    S .= B .+ H
    end

    # All grid variables computed in a staggered grid
    # Compute surface gradients on edges
    @timeit get_timer("ODINN") "Surface gradients" begin
    diff_x!(dSdx, S, Δx)  
    diff_y!(dSdy, S, Δy) 
    ∇S .= (avg_y(dSdx).^2 .+ avg_x(dSdy).^2).^((n - 1)/2) 
    end

    @timeit get_timer("ODINN") "Diffusivity" begin
    Γ[] = 2.0 * A[] * (ρ * g)^n / (n+2) # 1 / m^3 s 
    D .= Γ[] .* avg(H).^(n + 2) .* ∇S
    end

    # Compute flux components
    @timeit get_timer("ODINN") "Gradients edges" begin
    @views diff_x!(dSdx_edges, S[:,2:end - 1], Δx)
    @views diff_y!(dSdy_edges, S[2:end - 1,:], Δy)
    end
    # Cap surface elevaton differences with the upstream ice thickness to 
    # imporse boundary condition of the SIA equation
    @timeit get_timer("ODINN") "Capping flux" begin
    η₀ = params.physical.η₀
    dSdx_edges .= min.(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]./Δx,  η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= min.(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]./Δy,  η₀ * H[2:end-1, 2:end]./Δy) 
    dSdx_edges .= max.(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]./Δx, -η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= max.(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]./Δy, -η₀ * H[2:end-1, 2:end]./Δy)
    end

    @timeit get_timer("ODINN") "Flux" begin
    Fx .= .-avg_y(D) .* dSdx_edges
    Fy .= .-avg_x(D) .* dSdy_edges 
    end

    #  Flux divergence
    @timeit get_timer("ODINN") "dH" begin
    inn(dH) .= .-(diff_x(Fx) ./ Δx .+ diff_y(Fy) ./ Δy) 
    end
end

And these are the main results in terms of memory allocation:

to =  ────────────────────────────────────────────────────────────────────────────────────
                                            Time                    Allocations      
                                   ───────────────────────   ────────────────────────
         Tot / % measured:              22.1s /  93.6%           37.1GiB /  99.9%    

 Section                   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 Run prediction                 1    20.7s  100.0%   20.7s   37.1GiB  100.0%  37.1GiB
   Simulate iceflow PDE         1    19.8s   95.7%   19.8s   37.1GiB  100.0%  37.1GiB
     Capping flux           13.2k    9.65s   46.7%   729μs   25.4GiB   68.6%  1.97MiB
     Surface gradients      13.2k    3.22s   15.6%   243μs   3.21GiB    8.6%   254KiB
     Diffusivity            13.2k    2.49s   12.0%   188μs   1.61GiB    4.3%   127KiB
     Mass balance             120    1.17s    5.7%  9.75ms    390MiB    1.0%  3.25MiB
     Flux                   13.2k    969ms    4.7%  73.2μs   3.18GiB    8.6%   252KiB
     dH                     13.2k    965ms    4.7%  72.9μs   3.15GiB    8.5%   249KiB
     Gradients edges        13.2k    377ms    1.8%  28.5μs     0.00B    0.0%    0.00B
     H to zero              13.2k    292ms    1.4%  22.1μs   84.9MiB    0.2%  6.56KiB
     Variable assignment    13.2k    122ms    0.6%  9.18μs    828KiB    0.0%    64.0B
 ────────────────────────────────────────────────────────────────────────────────────

What other tricks could I use the further reduce memory allocation? Particularly for the min. part. Thanks a lot in advance!

1 Like

I think the problem is not the min. itself but the array expression afterwards. Does it work if you write
η₀ .* H[1:end-1, 2:end-1]./Δx instead?

In general, I think code visibility suffers a bit from trying to vectorize everything instead of writing explicit loops. If you loop explicitly, it is often easier to avoid allocations as well (at least I think so)

By the way, you can use the macro @. to vectorize the entire line. Also, annotations like

B::Matrix{F} = glacier.B

usually do not do anything for performance

Thanks for the feedback, @Salmon. I updated those lines to (I’m aware that it could be written with a begin end clause for encompass all lines. Still I prefer this syntax to have a more homogenous style throughout the function):

    dSdx_edges .= @. @views min(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]/Δx,  η₀ * H[2:end, 2:end-1]/Δx)
    dSdy_edges .= @. @views min(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]/Δy,  η₀ * H[2:end-1, 2:end]/Δy) 
    dSdx_edges .= @. @views max(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]/Δx, -η₀ * H[2:end, 2:end-1]/Δx)
    dSdy_edges .= @. @views max(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]/Δy, -η₀ * H[2:end-1, 2:end]/Δy)

And indeed, the performance increased:

Capping flux    13.2k    6.57s   36.7%   497μs   12.7GiB   52.2%  0.98MiB

However, it’s still pretty high. Anything else that could be done to improve it further? Same goes for the rest of the code. Thanks!

Where are the lines that the allocation profiler says are allocating?

1 Like

OK, I’ve been playing around and I’ve managed to optimize it quite a lot. Here’s the latest version:

function SIA2D!(dH::Matrix{F}, H::Matrix{F}, simulation::SIM, t::F) where {F <: AbstractFloat, SIM <: Simulation}
    
    # Retrieve parameters
    @timeit get_timer("ODINN") "Variable assignment" begin
    SIA2D_model::SIA2Dmodel = simulation.model.iceflow
    glacier::Glacier = simulation.glaciers[simulation.model.iceflow.glacier_idx[]]
    params::Parameters = simulation.parameters
    int_type = simulation.parameters.simulation.int_type
    H̄::Matrix{F} = SIA2D_model.H̄
    A::Ref{F} = SIA2D_model.A
    B::Matrix{F} = glacier.B
    S::Matrix{F} = SIA2D_model.S
    dSdx::Matrix{F} = SIA2D_model.dSdx
    dSdy::Matrix{F} = SIA2D_model.dSdy
    D::Matrix{F} = SIA2D_model.D
    Dx::Matrix{F} = SIA2D_model.Dx
    Dy::Matrix{F} = SIA2D_model.Dy
    dSdx_edges::Matrix{F} = SIA2D_model.dSdx_edges
    dSdy_edges::Matrix{F} = SIA2D_model.dSdy_edges
    ∇S::Matrix{F} = SIA2D_model.∇S
    ∇Sx::Matrix{F} = SIA2D_model.∇Sx
    ∇Sy::Matrix{F} = SIA2D_model.∇Sy
    Fx::Matrix{F} = SIA2D_model.Fx
    Fy::Matrix{F} = SIA2D_model.Fy
    Fxx::Matrix{F} = SIA2D_model.Fxx
    Fyy::Matrix{F} = SIA2D_model.Fyy
    Δx::F = glacier.Δx
    Δy::F = glacier.Δy
    Γ::Ref{F} = SIA2D_model.Γ
    n::int_type = simulation.parameters.physical.n
    ρ::F = simulation.parameters.physical.ρ
    g::F = simulation.parameters.physical.g
    end

    # First, enforce values to be positive
    @timeit get_timer("ODINN") "H to zero" begin
    H[H.<0.0] .= 0.0
    # Update glacier surface altimetry
    S .= B .+ H
    end

    # All grid variables computed in a staggered grid
    # Compute surface gradients on edges
    @timeit get_timer("ODINN") "Surface gradients" begin
    diff_x!(dSdx, S, Δx)  
    diff_y!(dSdy, S, Δy) 
    avg_y!(∇Sx, dSdx)
    avg_x!(∇Sy, dSdy)
    ∇S .= (∇Sx.^2 .+ ∇Sy.^2).^((n - 1)/2) 
    end

    @timeit get_timer("ODINN") "Diffusivity" begin
    # @infiltrate
    avg!(H̄, H)
    Γ[] = 2.0 * A[] * (ρ * g)^n / (n+2) # 1 / m^3 s 
    D .= Γ[] .* H̄.^(n + 2) .* ∇S
    end

    # Compute flux components
    @timeit get_timer("ODINN") "Gradients edges" begin
    @views diff_x!(dSdx_edges, S[:,2:end - 1], Δx)
    @views diff_y!(dSdy_edges, S[2:end - 1,:], Δy)
    end
    # Cap surface elevaton differences with the upstream ice thickness to 
    # imporse boundary condition of the SIA equation
    @timeit get_timer("ODINN") "Capping flux" begin
    η₀ = params.physical.η₀
    dSdx_edges .= @views @. min.(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]./Δx,  η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= @views @. min.(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]./Δy,  η₀ * H[2:end-1, 2:end]./Δy) 
    dSdx_edges .= @views @. max.(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]./Δx, -η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= @views @. max.(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]./Δy, -η₀ * H[2:end-1, 2:end]./Δy)
    end

    @timeit get_timer("ODINN") "Flux" begin
    avg_y!(Dx, D)
    avg_x!(Dy, D)
    Fx .= .-Dx .* dSdx_edges
    Fy .= .-Dy .* dSdy_edges 
    end

    #  Flux divergence
    @timeit get_timer("ODINN") "dH" begin
    diff_x!(Fxx, Fx, Δx)
    diff_y!(Fyy, Fy, Δy)
    inn(dH) .= .-(Fxx .+ Fyy) 
    end
end

Which yields these results:

to =  ────────────────────────────────────────────────────────────────────────────────────
                                            Time                    Allocations      
                                   ───────────────────────   ────────────────────────
         Tot / % measured:              15.6s /  91.0%            644MiB /  92.5%    

 Section                   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 Run prediction                 1    14.2s  100.0%   14.2s    595MiB  100.0%   595MiB
   Simulate iceflow PDE         1    13.1s   91.8%   13.1s    579MiB   97.2%   579MiB
     Capping flux           13.2k    4.50s   31.6%   340μs   61.6MiB   10.4%  4.77KiB
     Surface gradients      13.2k    2.68s   18.9%   203μs   3.03MiB    0.5%     240B
     Diffusivity            13.2k    2.35s   16.5%   177μs   4.04MiB    0.7%     320B
     Mass balance             120    1.19s    8.4%  9.94ms    390MiB   65.5%  3.25MiB
     dH                     13.2k    509ms    3.6%  38.4μs     0.00B    0.0%    0.00B
     Flux                   13.2k    493ms    3.5%  37.2μs     0.00B    0.0%    0.00B
     Gradients edges        13.2k    377ms    2.6%  28.5μs     0.00B    0.0%    0.00B
     H to zero              13.2k    296ms    2.1%  22.3μs   84.9MiB   14.3%  6.56KiB
     Variable assignment    13.2k    134ms    0.9%  10.1μs    828KiB    0.1%    64.0B
 ────────────────────────────────────────────────────────────────────────────────────

Went down from 37GB to just 0.6GB; not bad.

Wait, so the allocations are gone if you use @views on each line as opposed to the whole begin end block? Or what exactly gets rid of them?
By the way you now have @. together with some individual dots. This is not wrong but maybe a bit redundant :slight_smile:

It’s not the critical part of your code, but you can get rid of the allocations of the H to zero block as

H0 = rand(32,32).-0.5
H1 = copy(H0)
H2 = copy(H0)

function f1!(H)
    @views H[H.<0.0] .= 0.0
end

function f2!(H)
    map!(x -> x < 0.0 ? 0.0 : x, H, H)
end
julia> @btime f1!($H1)
  790.967 ns (6 allocations: 4.56 KiB)

julia> @btime f2!($H2)
  65.754 ns (0 allocations: 0 bytes)

julia> H1 == H2
true
1 Like

I don’t know if it’s a measurement error or if there’s a substantial reason, but the ifelse function seems to perform a little bit better than the ternary operator

function f3!(H)
    map!(x -> ifelse(x>0.0,x,0.0), H, H)
end
1 Like