Hi everyone,

I am using DifferentialEquations.jl to solve 2 couple non-linear PDEs used to model a two-phase flow in a porous media:

\begin{align} \frac{\partial \phi}{\partial t} = \nabla \cdot [\phi^n(\nabla P + u_z)] \\ De \frac{\partial P}{\partial t} = \nabla \cdot [\phi^n(\nabla P + u_z)]- \frac{\phi^m}{R - H(P)(R-1)} P \end{align}

The implementation and the equations are similar to this topic (Coupled non-linear PDEs using DifferentialEquations.jl and the MOL), except that I have a new term with R, a constant and H(Pe), the Heaviside function of P on the second equation.

This is the final form of my equation so I would like to have something fast for it because I need to do 2D models. The problem is that adding this new term is slowing down considerably my models (from a few minutes to a few hours for a settings that I could use in 2D). The fastest linear solver is KLUFactorization() but I would like to use KrylovJL_GMRES() with preconditionners (like in this new tutorial Solving Large Stiff Equations · DifferentialEquations.jl) because it should scale really well for lots of nodes.

The problem is that the performance is quite disappointing so I would like to see if someone find it strange or if it cannot really be applied to my equations. I thought first that it was coming from the Heaviside function but after approximating it with a tanh function, it is definitely better but not as good as I would like.

Enough talking, here is the main function in 2D and the benchmarks I did:

```
function limit_neumann(a, n)
# check if index is on the boundary, if true take value on the opposite for homogeneous neumann boundary, if false don't change the value
a == n+1 ? a = n-1 : a == 0 ? a = 2 : a = a
end
# Approximation of the Heaviside function copied from here: https://discourse.julialang.org/t/handling-instability-when-solving-ode-problems/9019/5
const v1 = 0 # min of the Heaviside function
const v2 = 1.0 # max of the Heaviside function
H(Pe) = (v2-v1)*tanh(50*(Pe-(-0.5)))/2 + (v1 + v2) / 2
# Define the problem in a form of ODEs by discretizing in space
function porosity_wave(du, u, p, t)
ϕ_ad = @view u[:, :, 1]
Pe_ad = @view u[:, :, 2]
Δz_ad, Δx_ad, n, m, R, De = p
# loop over all the points
@inbounds for I in CartesianIndices((nz, nx))
i, j = Tuple(I)
jw, je = limit_neumann(j-1, nx), limit_neumann(j+1, nx)
# bottom boundary with Dirichlet
if i == 1
# porosity
du[i, j, 1] = - (((ϕ_ad[i+1, j]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i+1, j] - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_bot^n) / 2 * ((Pe_ad[i, j] - Pe_bot) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad))
# pressure
du[i, j, 2] = 1 / (ϕ_ad[i, j]^b * De) *
(((ϕ_ad[i+1, j]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i+1, j] - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_bot^n) / 2 * ((Pe_ad[i, j] - Pe_bot) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad) -
ϕ_ad[i, j]^m * Pe_ad[i, j] / (R - H(Pe_ad[i, j]) * (R - 1)))
# top boundary with Dirichlet
elseif i == nz
# porosity
du[i, j, 1] = - (((ϕ_top^n + ϕ_ad[i, j]^n) / 2 * ((Pe_top - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_ad[i-1, j]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i-1, j]) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad))
# pressure
du[i, j, 2] = 1 / (ϕ_ad[i, j]^b * De) *
(((ϕ_top^n + ϕ_ad[i, j]^n) / 2 * ((Pe_top - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_ad[i-1, j]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i-1, j]) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad) -
ϕ_ad[i, j]^m * Pe_ad[i, j] / (R - H(Pe_ad[i, j]) * (R - 1)))
# everything else
else
# porosity
du[i, j, 1] = - (((ϕ_ad[i+1, j]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i+1, j] - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_ad[i-1, j]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i-1, j]) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad))
# pressure
du[i, j, 2] = 1 /(ϕ_ad[i, j]^b * De) *
(((ϕ_ad[i+1, j]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i+1, j] - Pe_ad[i, j]) / Δz_ad + 1) -
(ϕ_ad[i, j]^n + ϕ_ad[i-1, j]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i-1, j]) / Δz_ad + 1)) / Δz_ad +
(((ϕ_ad[i, je]^n + ϕ_ad[i, j]^n) / 2 * ((Pe_ad[i, je] - Pe_ad[i, j]) / Δx_ad) -
(ϕ_ad[i, j]^n + ϕ_ad[i, jw]^n) / 2 * ((Pe_ad[i, j] - Pe_ad[i, jw]) / Δx_ad)) / Δx_ad) -
ϕ_ad[i, j]^m * Pe_ad[i, j] / (R - H(Pe_ad[i, j]) * (R - 1)))
end
end
end
```

TRBDF2 is the fastest method for my case and here are some computational time for 501 nodes and from 0 to 6 dimensionless timestep for the 1D equivalent of that code:

```
@time sol = solve(prob_sparse, TRBDF2(linsolve=UMFPACKFactorization()), progress = true, progress_steps = 1, save_everystep=false);
# 12.793877 seconds (1.36 M allocations: 9.276 GiB, 1.87% gc time)
@time solve(prob_sparse, TRBDF2(linsolve=KLUFactorization()), progress = true, progress_steps = 1, save_everystep=false);
# 13.084809 seconds (4.52 M allocations: 5.201 GiB, 1.36% gc time, 1.10% compilation time)
@time solve(prob, TRBDF2(linsolve=KrylovJL_GMRES()), progress = true, progress_steps = 1, save_everystep=false);
# 149.106861 seconds (17.02 M allocations: 1.390 GiB, 0.14% gc time, 1.40% compilation time)
```

KrylovJL_GMRES beeing slower without a preconditionner is fine for me. But here is with preconditionners:

```
# multigrid
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# more than 1h for multigrid
# jacobian smoother
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=algebraicmultigrid2,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 2399.434492 seconds (73.80 M allocations: 72.087 GiB, 0.41% gc time, 0.21% compilation time)
```

for incompletelu, I had to use a very big Tau to see a result but it is not really good (better than without at least):

```
# τ = 50.0
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 369.047623 seconds (19.37 M allocations: 1.705 GiB, 0.08% gc time, 1.26% compilation time)
# τ = 5
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 371.915629 seconds (10.18 M allocations: 1.069 GiB, 0.05% gc time, 0.28% compilation time)
# τ = 100
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 363.336921 seconds (10.03 M allocations: 1.018 GiB, 0.04% gc time, 0.26% compilation time)
# τ = 500
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 388.538630 seconds (10.34 M allocations: 1.030 GiB, 0.04% gc time, 0.24% compilation time)
# τ = 1000
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 380.170013 seconds (10.33 M allocations: 1.032 GiB, 0.19% gc time, 0.23% compilation time)
# τ = 5e5
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 62.048754 seconds (3.94 M allocations: 346.247 MiB, 0.06% gc time, 1.31% compilation time)
# τ = 5e4
@time solve(prob_sparse, TRBDF2(linsolve=KrylovJL_GMRES(),precs=incompletelu,concrete_jac=true),progress = true, progress_steps = 1, save_everystep=false);
# 60.994903 seconds (2.36 M allocations: 219.073 MiB, 0.10% gc t
```

Around tau=5e4 is the best I could do, so 1 min. Is there some hope to speed this up, or is the new non-linearity just too hard to solve?