Advanced tricks to reduce memory allocations of ODE

JordiBolibar · June 26, 2023, 12:38pm

Hello,

I’m trying to further reduce memory allocations when solving an ODE using DifferentialEquations.jl. I have already implemented most of the tricks I’m aware of, but I still get a ton of memory allocation when calling min..

Here’s the du function I’m trying to optimize:

function SIA2D!(dH::Matrix{F}, H::Matrix{F}, simulation::SIM, t::F) where {F <: AbstractFloat, SIM <: Simulation}
    
    # Retrieve parameters
    @timeit get_timer("ODINN") "Variable assignment" begin
    SIA2D_model::SIA2Dmodel = simulation.model.iceflow
    glacier::Glacier = simulation.glaciers[simulation.model.iceflow.glacier_idx[]]
    params::Parameters = simulation.parameters
    int_type = simulation.parameters.simulation.int_type
    A::Ref{F} = SIA2D_model.A
    B::Matrix{F} = glacier.B
    S::Matrix{F} = SIA2D_model.S
    dSdx::Matrix{F} = SIA2D_model.dSdx
    dSdy::Matrix{F} = SIA2D_model.dSdy
    D::Matrix{F} = SIA2D_model.D
    dSdx_edges::Matrix{F} = SIA2D_model.dSdx_edges
    dSdy_edges::Matrix{F} = SIA2D_model.dSdy_edges
    ∇S::Matrix{F} = SIA2D_model.∇S
    Fx::Matrix{F} = SIA2D_model.Fx
    Fy::Matrix{F} = SIA2D_model.Fy
    Δx::F = glacier.Δx
    Δy::F = glacier.Δy
    Γ::Ref{F} = SIA2D_model.Γ
    n::int_type = simulation.parameters.physical.n
    ρ::F = simulation.parameters.physical.ρ
    g::F = simulation.parameters.physical.g
    end

    # First, enforce values to be positive
    @timeit get_timer("ODINN") "H to zero" begin
    H[H.<0.0] .= 0.0
    # Update glacier surface altimetry
    S .= B .+ H
    end

    # All grid variables computed in a staggered grid
    # Compute surface gradients on edges
    @timeit get_timer("ODINN") "Surface gradients" begin
    diff_x!(dSdx, S, Δx)  
    diff_y!(dSdy, S, Δy) 
    ∇S .= (avg_y(dSdx).^2 .+ avg_x(dSdy).^2).^((n - 1)/2) 
    end

    @timeit get_timer("ODINN") "Diffusivity" begin
    Γ[] = 2.0 * A[] * (ρ * g)^n / (n+2) # 1 / m^3 s 
    D .= Γ[] .* avg(H).^(n + 2) .* ∇S
    end

    # Compute flux components
    @timeit get_timer("ODINN") "Gradients edges" begin
    @views diff_x!(dSdx_edges, S[:,2:end - 1], Δx)
    @views diff_y!(dSdy_edges, S[2:end - 1,:], Δy)
    end
    # Cap surface elevaton differences with the upstream ice thickness to 
    # imporse boundary condition of the SIA equation
    @timeit get_timer("ODINN") "Capping flux" begin
    η₀ = params.physical.η₀
    dSdx_edges .= min.(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]./Δx,  η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= min.(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]./Δy,  η₀ * H[2:end-1, 2:end]./Δy) 
    dSdx_edges .= max.(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]./Δx, -η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= max.(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]./Δy, -η₀ * H[2:end-1, 2:end]./Δy)
    end

    @timeit get_timer("ODINN") "Flux" begin
    Fx .= .-avg_y(D) .* dSdx_edges
    Fy .= .-avg_x(D) .* dSdy_edges 
    end

    #  Flux divergence
    @timeit get_timer("ODINN") "dH" begin
    inn(dH) .= .-(diff_x(Fx) ./ Δx .+ diff_y(Fy) ./ Δy) 
    end
end

And these are the main results in terms of memory allocation:

to =  ────────────────────────────────────────────────────────────────────────────────────
                                            Time                    Allocations      
                                   ───────────────────────   ────────────────────────
         Tot / % measured:              22.1s /  93.6%           37.1GiB /  99.9%    

 Section                   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 Run prediction                 1    20.7s  100.0%   20.7s   37.1GiB  100.0%  37.1GiB
   Simulate iceflow PDE         1    19.8s   95.7%   19.8s   37.1GiB  100.0%  37.1GiB
     Capping flux           13.2k    9.65s   46.7%   729μs   25.4GiB   68.6%  1.97MiB
     Surface gradients      13.2k    3.22s   15.6%   243μs   3.21GiB    8.6%   254KiB
     Diffusivity            13.2k    2.49s   12.0%   188μs   1.61GiB    4.3%   127KiB
     Mass balance             120    1.17s    5.7%  9.75ms    390MiB    1.0%  3.25MiB
     Flux                   13.2k    969ms    4.7%  73.2μs   3.18GiB    8.6%   252KiB
     dH                     13.2k    965ms    4.7%  72.9μs   3.15GiB    8.5%   249KiB
     Gradients edges        13.2k    377ms    1.8%  28.5μs     0.00B    0.0%    0.00B
     H to zero              13.2k    292ms    1.4%  22.1μs   84.9MiB    0.2%  6.56KiB
     Variable assignment    13.2k    122ms    0.6%  9.18μs    828KiB    0.0%    64.0B
 ────────────────────────────────────────────────────────────────────────────────────

What other tricks could I use the further reduce memory allocation? Particularly for the min. part. Thanks a lot in advance!

Salmon · June 26, 2023, 1:08pm

I think the problem is not the min. itself but the array expression afterwards. Does it work if you write
η₀ .* H[1:end-1, 2:end-1]./Δx instead?

In general, I think code visibility suffers a bit from trying to vectorize everything instead of writing explicit loops. If you loop explicitly, it is often easier to avoid allocations as well (at least I think so)

By the way, you can use the macro @. to vectorize the entire line. Also, annotations like

B::Matrix{F} = glacier.B

usually do not do anything for performance

JordiBolibar · June 26, 2023, 2:04pm

Thanks for the feedback, @Salmon. I updated those lines to (I’m aware that it could be written with a begin end clause for encompass all lines. Still I prefer this syntax to have a more homogenous style throughout the function):

    dSdx_edges .= @. @views min(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]/Δx,  η₀ * H[2:end, 2:end-1]/Δx)
    dSdy_edges .= @. @views min(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]/Δy,  η₀ * H[2:end-1, 2:end]/Δy) 
    dSdx_edges .= @. @views max(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]/Δx, -η₀ * H[2:end, 2:end-1]/Δx)
    dSdy_edges .= @. @views max(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]/Δy, -η₀ * H[2:end-1, 2:end]/Δy)

And indeed, the performance increased:

Capping flux    13.2k    6.57s   36.7%   497μs   12.7GiB   52.2%  0.98MiB

However, it’s still pretty high. Anything else that could be done to improve it further? Same goes for the rest of the code. Thanks!

ChrisRackauckas · June 26, 2023, 2:54pm

Where are the lines that the allocation profiler says are allocating?

JordiBolibar · June 26, 2023, 5:32pm

OK, I’ve been playing around and I’ve managed to optimize it quite a lot. Here’s the latest version:

function SIA2D!(dH::Matrix{F}, H::Matrix{F}, simulation::SIM, t::F) where {F <: AbstractFloat, SIM <: Simulation}
    
    # Retrieve parameters
    @timeit get_timer("ODINN") "Variable assignment" begin
    SIA2D_model::SIA2Dmodel = simulation.model.iceflow
    glacier::Glacier = simulation.glaciers[simulation.model.iceflow.glacier_idx[]]
    params::Parameters = simulation.parameters
    int_type = simulation.parameters.simulation.int_type
    H̄::Matrix{F} = SIA2D_model.H̄
    A::Ref{F} = SIA2D_model.A
    B::Matrix{F} = glacier.B
    S::Matrix{F} = SIA2D_model.S
    dSdx::Matrix{F} = SIA2D_model.dSdx
    dSdy::Matrix{F} = SIA2D_model.dSdy
    D::Matrix{F} = SIA2D_model.D
    Dx::Matrix{F} = SIA2D_model.Dx
    Dy::Matrix{F} = SIA2D_model.Dy
    dSdx_edges::Matrix{F} = SIA2D_model.dSdx_edges
    dSdy_edges::Matrix{F} = SIA2D_model.dSdy_edges
    ∇S::Matrix{F} = SIA2D_model.∇S
    ∇Sx::Matrix{F} = SIA2D_model.∇Sx
    ∇Sy::Matrix{F} = SIA2D_model.∇Sy
    Fx::Matrix{F} = SIA2D_model.Fx
    Fy::Matrix{F} = SIA2D_model.Fy
    Fxx::Matrix{F} = SIA2D_model.Fxx
    Fyy::Matrix{F} = SIA2D_model.Fyy
    Δx::F = glacier.Δx
    Δy::F = glacier.Δy
    Γ::Ref{F} = SIA2D_model.Γ
    n::int_type = simulation.parameters.physical.n
    ρ::F = simulation.parameters.physical.ρ
    g::F = simulation.parameters.physical.g
    end

    # First, enforce values to be positive
    @timeit get_timer("ODINN") "H to zero" begin
    H[H.<0.0] .= 0.0
    # Update glacier surface altimetry
    S .= B .+ H
    end

    # All grid variables computed in a staggered grid
    # Compute surface gradients on edges
    @timeit get_timer("ODINN") "Surface gradients" begin
    diff_x!(dSdx, S, Δx)  
    diff_y!(dSdy, S, Δy) 
    avg_y!(∇Sx, dSdx)
    avg_x!(∇Sy, dSdy)
    ∇S .= (∇Sx.^2 .+ ∇Sy.^2).^((n - 1)/2) 
    end

    @timeit get_timer("ODINN") "Diffusivity" begin
    # @infiltrate
    avg!(H̄, H)
    Γ[] = 2.0 * A[] * (ρ * g)^n / (n+2) # 1 / m^3 s 
    D .= Γ[] .* H̄.^(n + 2) .* ∇S
    end

    # Compute flux components
    @timeit get_timer("ODINN") "Gradients edges" begin
    @views diff_x!(dSdx_edges, S[:,2:end - 1], Δx)
    @views diff_y!(dSdy_edges, S[2:end - 1,:], Δy)
    end
    # Cap surface elevaton differences with the upstream ice thickness to 
    # imporse boundary condition of the SIA equation
    @timeit get_timer("ODINN") "Capping flux" begin
    η₀ = params.physical.η₀
    dSdx_edges .= @views @. min.(dSdx_edges,  η₀ * H[1:end-1, 2:end-1]./Δx,  η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= @views @. min.(dSdy_edges,  η₀ * H[2:end-1, 1:end-1]./Δy,  η₀ * H[2:end-1, 2:end]./Δy) 
    dSdx_edges .= @views @. max.(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]./Δx, -η₀ * H[2:end, 2:end-1]./Δx)
    dSdy_edges .= @views @. max.(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]./Δy, -η₀ * H[2:end-1, 2:end]./Δy)
    end

    @timeit get_timer("ODINN") "Flux" begin
    avg_y!(Dx, D)
    avg_x!(Dy, D)
    Fx .= .-Dx .* dSdx_edges
    Fy .= .-Dy .* dSdy_edges 
    end

    #  Flux divergence
    @timeit get_timer("ODINN") "dH" begin
    diff_x!(Fxx, Fx, Δx)
    diff_y!(Fyy, Fy, Δy)
    inn(dH) .= .-(Fxx .+ Fyy) 
    end
end

Which yields these results:

to =  ────────────────────────────────────────────────────────────────────────────────────
                                            Time                    Allocations      
                                   ───────────────────────   ────────────────────────
         Tot / % measured:              15.6s /  91.0%            644MiB /  92.5%    

 Section                   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 Run prediction                 1    14.2s  100.0%   14.2s    595MiB  100.0%   595MiB
   Simulate iceflow PDE         1    13.1s   91.8%   13.1s    579MiB   97.2%   579MiB
     Capping flux           13.2k    4.50s   31.6%   340μs   61.6MiB   10.4%  4.77KiB
     Surface gradients      13.2k    2.68s   18.9%   203μs   3.03MiB    0.5%     240B
     Diffusivity            13.2k    2.35s   16.5%   177μs   4.04MiB    0.7%     320B
     Mass balance             120    1.19s    8.4%  9.94ms    390MiB   65.5%  3.25MiB
     dH                     13.2k    509ms    3.6%  38.4μs     0.00B    0.0%    0.00B
     Flux                   13.2k    493ms    3.5%  37.2μs     0.00B    0.0%    0.00B
     Gradients edges        13.2k    377ms    2.6%  28.5μs     0.00B    0.0%    0.00B
     H to zero              13.2k    296ms    2.1%  22.3μs   84.9MiB   14.3%  6.56KiB
     Variable assignment    13.2k    134ms    0.9%  10.1μs    828KiB    0.1%    64.0B
 ────────────────────────────────────────────────────────────────────────────────────

Went down from 37GB to just 0.6GB; not bad.

Salmon · June 26, 2023, 5:56pm

Wait, so the allocations are gone if you use @views on each line as opposed to the whole begin end block? Or what exactly gets rid of them?
By the way you now have @. together with some individual dots. This is not wrong but maybe a bit redundant

ademonts · June 26, 2023, 7:51pm

It’s not the critical part of your code, but you can get rid of the allocations of the H to zero block as

H0 = rand(32,32).-0.5
H1 = copy(H0)
H2 = copy(H0)

function f1!(H)
    @views H[H.<0.0] .= 0.0
end

function f2!(H)
    map!(x -> x < 0.0 ? 0.0 : x, H, H)
end

julia> @btime f1!($H1)
  790.967 ns (6 allocations: 4.56 KiB)

julia> @btime f2!($H2)
  65.754 ns (0 allocations: 0 bytes)

julia> H1 == H2
true

rocco_sprmnt21 · June 27, 2023, 11:19am

I don’t know if it’s a measurement error or if there’s a substantial reason, but the ifelse function seems to perform a little bit better than the ternary operator

function f3!(H)
    map!(x -> ifelse(x>0.0,x,0.0), H, H)
end

Topic		Replies	Views
Minimising DifferentialEquations GPU memory allocation Modelling & Simulations question	8	285	May 19, 2023
Reducing RAM usage when solving large set of differential equations Performance question	5	144	February 6, 2025
In-place allocations in DifferentialEquations Modelling & Simulations diffeq	9	770	December 14, 2020
Reducing the memory allocation in ODEs Performance question	11	1317	January 22, 2022
Preallocate u and du in DifferentialEquations.jl General Usage ordinarydiffeq	8	374	September 15, 2023

Advanced tricks to reduce memory allocations of ODE

Related topics