I have a UDE based on a 2D nonlinear ice flow diffusion PDE. It works correctly both for the forward and backwards pass, but despite using a semi-implicit solver the memory usage for backpropagation is just too big for even very short simulations, resulting in the process being killed. So far my model is implemented to parallelize multiple simulations for each training batch using pmap (not shown), but this doesn’t help, since the memory issues occur within each of the simulations of each batch.
I’m still very new to Zygote, so I was wondering: what would be the best way to optimize Zygote in that case in order to circumvent memory issues?
Here’s a rough overview of what I’m doing:
# Loss function
function loss(H, glacier_ref, UA, p, t, t₁)
H = iceflow!(H, UA, p,t,t₁)
l_H = sqrt(Flux.Losses.mse(H[H .!= 0.0], glacier_ref["H"][end][H.!= 0.0]; agg=sum))
return l_H
end
# Ice flow UDE
function iceflow!(H, UA, p,t,t₁)
# Retrieve input variables
let
current_year = 0
total_iter = 0
t_step = 0
temps = p[6]
# Forward scheme implementation
while t < t₁
let
iter = 1
err = 2 * tolnl
Hold = copy(H)
dHdt = zeros(nx, ny)
# Get current year for MB and ELA
year = floor(Int, t) + 1
if year != current_year
# Predict value of `A`
temp = [temps[year]]'
ŶA = predict_A̅(UA, temp)
## Unpack and repack tuple with updated A value
Δx, Δy, Γ, A, B, temps, C, α = p
p = (Δx, Δy, Γ, ŶA, B, temps, C, α)
current_year = year
end
while err > tolnl && iter < itMax+1
Err = copy(H)
# Compute the Shallow Ice Approximation in a staggered grid
F, dτ = SIA(H, p)
# Compute the residual ice thickness for the inertia
@tullio ResH[i,j] := -(H[i,j] - Hold[i,j])/Δt + F[pad(i-1,1,1),pad(j-1,1,1)]
dHdt_ = copy(dHdt)
@tullio dHdt[i,j] := dHdt_[i,j]*damp + ResH[i,j]
# We keep local copies for tullio
H_ = copy(H)
# Update the ice thickness
@tullio H[i,j] := max(0.0, H_[i,j] + dHdt[i,j]*dτ)
iter += 1
total_iter += 1
end
t += Δt
t_step += 1
end # let
end
return H
end # let
end
# Shallow Ice Approximation
function SIA(H, p)
Δx, Δy, Γ, A, B, temps, C, α = p
# Update glacier surface altimetry
S = B .+ H
# All grid variables computed in a staggered grid
# Compute surface gradients on edges
dSdx = diff(S, dims=1) / Δx
dSdy = diff(S, dims=2) / Δy
∇S² = avg_y(dSdx).^2 .+ avg_x(dSdy).^2
Γ = 2 * A * (ρ * g)^n / (n+2) # 1 / m^3 s
D = Γ .* avg(H).^(n + 2) .* ∇S².^((n - 1)/2)
# Compute flux components
dSdx_edges = diff(S[:,2:end - 1], dims=1) / Δx
dSdy_edges = diff(S[2:end - 1,:], dims=2) / Δy
Fx = .-avg_y(D) .* dSdx_edges
Fy = .-avg_x(D) .* dSdy_edges
# Flux divergence
F = .-(diff(Fx, dims=1) / Δx .+ diff(Fy, dims=2) / Δy) # MB to be added here
# Compute dτ for the implicit method
#dτ = dτsc * min.( 10.0 , 1.0./(1.0/Δt .+ 1.0./(cfl./(ϵ .+ avg(D)))))
D_max = 3000000
current_D_max = maximum(D)
if D_max < current_D_max
error("Increase Maximum diffusivity. Required value must be larger than $current_D_max")
end
dτ = dτsc * min( 10.0 , 1.0/(1.0/Δt + 1.0/(cfl/(ϵ + D_max))))
return F, dτ
end
# Compute pullback for ice flow UDE
loss_UA, back_UA = Zygote.pullback(() -> loss(H, glacier_ref, UA, p, t, t₁), θ)
Thanks in advance!