Why does my devectorized code keep allocating memory?

I am trying to solve a complex Ginzburg-Landau equation in Julia, defined by

# In-place RHS of the complex Ginzburg-Landau equation on a flattened grid.
#
# du, u : complex state vectors of length N (the Nx×Nx grid written as a column)
# p     : tuple (NBmat, gridsize, η, g, V, α, σ) where
#         NBmat    — N×N (sparse) neighbour-coupling matrix (Laplacian stencil)
#         gridsize — scalar grid spacing
#         η, V, α, σ — real, spatially dependent vectors of length N
#         g        — real scalar nonlinearity coefficient
# t     : time (unused; required by the DiffEq interface)
#
# The original mixed dotted and undotted operators, so each undotted `+`/`*`
# allocated a full temporary array on every call. Here everything except the
# unavoidable matrix-vector product NBmat*u is fused into one broadcast that
# writes straight into du.
function CGLfun!(du, u, p, t)
    NBmat, gridsize, η, g, V, α, σ = p
    # NBmat*u allocates one temporary; the rest fuses in place into du.
    du .= 1im .* (1 .- 1im .* η) .* (NBmat * u .- 4 .* u) ./ gridsize^2 .+
          (-1im .* V .+ α .- (σ .+ 1im * g) .* abs2.(u)) .* u
    return nothing
end

this was the first vectorized implementation, strongly similar to how I initially programmed it in MATLAB. Evolving it with Vern7() is maybe 20% faster than MATLAB’s ode113 result and takes about 600 sec on my desktop to evolve t from 0 until 100.
In this PDE, du is actually defined on a grid (of N=Nx^2=101^2 datapoints) as a complex variable, but I’ve rewritten it all in a column matrix for convenience. NBmat is an NxN sparse matrix that couples neighbouring sites, so the first term corresponds to the discretized Laplacian. η, V, α, σ are all spatially dependent (N entries) and real-valued (incidentally they are proportional to one another, so perhaps there’s a better way to import them, but for now I don’t think that should be the issue), while g is a constant.

The official benchmarks of JuliaDiffEq mention an approximately 100x speedup compared to MATLAB, and in this sense I was a bit disappointed by the only very modest gain, even when already using the in-place version.

From @ChrisRackauckas recent FAQ, I found this interesting reference for optimization Optimizing DiffEq Code . I saw from the example of the Gierer-Meinhardt Reaction-Diffusion PDE that most of the improvement can be obtained by devectorizing. So I checked the same example on my machine.

This is my output when evolving from 0 to 0.1 with save_everystep=false

BenchmarkTools.Trial: 
  memory estimate:  2.90 MiB
  allocs estimate:  62
  --------------
  minimum time:     4.835 ms (0.00% GC)
  median time:      4.917 ms (0.00% GC)
  mean time:        4.987 ms (1.13% GC)
  maximum time:     7.015 ms (26.40% GC)
  --------------
  samples:          1002
  evals/sample:     1

and from 0 to 1.0

BenchmarkTools.Trial: 
  memory estimate:  2.90 MiB
  allocs estimate:  62
  --------------
  minimum time:     44.611 ms (0.00% GC)
  median time:      45.434 ms (0.00% GC)
  mean time:        45.610 ms (0.11% GC)
  maximum time:     47.961 ms (3.81% GC)
  --------------
  samples:          110
  evals/sample:     1

so we see that the amount of memory assigned is constant.

In a very analogous way to this example, I also tried to rewrite my PDE in a devectorized way. Note that I’ve modified the original complex vector U of length N into a real Nx times Nx times 2 array, where the real and imaginary parts are separated. This gives in full

# In-place RHS of the complex Ginzburg-Landau equation on an Nx×Nx grid,
# with the complex field split into real part X = u[:,:,1] and imaginary
# part P = u[:,:,2].
#
# p3 : tuple (Nx, D, η, g, V, α, σ) where
#      Nx         — number of grid points per dimension
#      D          — 1/gridsize^2, the Laplacian prefactor
#      η, V, α, σ — real Nx×Nx coefficient arrays
#      g          — real scalar
# t  : time (unused; required by the DiffEq interface)
#
# Boundary handling: out-of-range neighbour indices are mirrored back inside
# (0 -> 2, Nx+1 -> Nx-1), which reproduces the `2*X[...]` ghost-node terms of
# the fully written-out Neumann stencil. Folding interior, edges and corners
# into a single loop replaces ~14 near-identical hand-unrolled loops while
# keeping the per-cell arithmetic the same.
function CGLfunrealdevect!(du, u, p3, t)
    Nx, D, η, g, V, α, σ = p3

    X = @view u[:, :, 1]
    P = @view u[:, :, 2]
    dX = @view du[:, :, 1]
    dP = @view du[:, :, 2]

    @inbounds for kk in 1:Nx, jj in 1:Nx
        # Mirror out-of-range neighbours (Neumann boundary condition).
        jm = jj == 1 ? 2 : jj - 1
        jp = jj == Nx ? Nx - 1 : jj + 1
        km = kk == 1 ? 2 : kk - 1
        kp = kk == Nx ? Nx - 1 : kk + 1

        x = X[jj, kk]
        pim = P[jj, kk]
        # Discrete 5-point Laplacians of the two field components.
        lapX = X[jm, kk] + X[jp, kk] + X[jj, km] + X[jj, kp] - 4 * x
        lapP = P[jm, kk] + P[jp, kk] + P[jj, km] + P[jj, kp] - 4 * pim
        amp2 = x^2 + pim^2  # local intensity |u|^2

        dX[jj, kk] = D * η[jj, kk] * lapX - D * lapP +
                     α[jj, kk] * x + V[jj, kk] * pim +
                     amp2 * (-σ[jj, kk] * x + g * pim)
        dP[jj, kk] = D * η[jj, kk] * lapP + D * lapX +
                     α[jj, kk] * pim - V[jj, kk] * x +
                     amp2 * (-σ[jj, kk] * pim - g * x)
    end
    return nothing
end

Gives for (0,0.1) evolution

BenchmarkTools.Trial: 
  memory estimate:  46.61 GiB
  allocs estimate:  2857792769
  --------------
  minimum time:     41.576 s (7.85% GC)
  median time:      41.576 s (7.85% GC)
  mean time:        41.576 s (7.85% GC)
  maximum time:     41.576 s (7.85% GC)
  --------------
  samples:          1
  evals/sample:     1

And for (0,1) evolution

BenchmarkTools.Trial: 
  memory estimate:  450.45 GiB
  allocs estimate:  27623039639
  --------------
  minimum time:     387.538 s (8.10% GC)
  median time:      387.538 s (8.10% GC)
  mean time:        387.538 s (8.10% GC)
  maximum time:     387.538 s (8.10% GC)
  --------------
  samples:          1
  evals/sample:     1

So in this case, there is clearly a huge amount of allocations, and in contrast to the example PDE, the memory is extensive with evolution length.

Any idea what I missed and how I can achieve a similar speedup as the example?

What I already tried in attempt to reproduce the ‘slow’ behavior with the Gierer-Meinhardt example:
*change the solver algorithm from Tsit5() to Vern7() as well
*make \alpha as there as well a spatially dependent grid
*put in some @view to rename (:,:,1) and (:,:,2)
*Putting a ; at the end of expressions
*Implicitly writing the multiplication signs between numbers and constants
*putting random pieces of the expression in parentheses
*inserting a perturbation in the form of 1e-10(u[i,j,1]^2+u[i,j,2]^2)u[i,j,1]
*changing indentation
but the memory-efficiency of that example has seemed robust against all these things…

Your code is very long and verbose, so it’s kind of hard to give a concrete suggestion. As far as I can tell, you only ever write to dP, dX and \alpha, and all other variables are only read from, correct? As a first step to make this more debuggable, I’d suggest eliminating common subexpressions that you recalculate over and over again and put them into their own variables. If the results are single integers or floats, those don’t allocate if the code is type stable! Only arrays and other mutable objects cause allocations in type stable code (there are some exceptions, but that’s much too detailed and doesn’t apply here).

Is your function type stable? What are the types of e.g. η and D, what about p3? If they’re 3D arrays, accessing only two indices may allocate (since you’d try to retrieve an array slice and slicing allocates).

; is only used in the REPL to suppress the output from being printed - it has no effect in code that’s not run at the top level.

Indentation is not relevant when discussing performance, it has no effect here. In general, whitespace is almost never significant in julia (and if it is, it changes behavior, not purely performance. This is the case when writing e.g. array literals).

2 Likes

Why not just @. du = ...? You’re not fusing the whole operation: you forgot a few dots so it’s allocating a few steps. That RHS function is not written in a very fast way, so that will impact the speed you see.

Edit: I see, matmul. Everything else should be dotted though, and it’s not.

Is that the right solver? Sounds like more of a problem for VCABM, QNDF, CVODE_BDF, LSODA, ROCK2, or ROCK4. Did you do a comparative study of methods?

Not sharing code makes it impossible to really debug, but from experience, I would guess that your function is not type-stable because you made p3 a Vector when it should be a Tuple since it’s filled with different typed objects. Did you look at @code_warntype to see?

2 Likes

thank you for your answer. I’ve been checking some of these things kind of from desperation, I saw for example people reporting a problem where Juno skips ill-indented lines (although I use VSC now). And I remember a case where it didn’t work because I called the array of variables ψ instead of u.

Being able to add intermediate values would indeed be something useful, if that works with (pointwise) matrix operations as well. I thought I couldn’t, so good to know

p3 being a vector was indeed the big problem. New output from simulation until 0.1

BenchmarkTools.Trial: 
  memory estimate:  12.95 MiB
  allocs estimate:  71
  --------------
  minimum time:     265.388 ms (0.00% GC)
  median time:      269.040 ms (0.00% GC)
  mean time:        270.049 ms (0.07% GC)
  maximum time:     283.038 ms (0.00% GC)
  --------------
  samples:          19
  evals/sample:     1

And evolution until 100 takes 160sec, a few times better than before :slight_smile:

My not using @. was partially because NBmat needs a normal matrix multiplication. But I wrote that before I properly saw the optimization tutorial.

Likely, there are better solvers. It’s pretty much the first one I tried. I saw that Vern7() was the go-to solver coming from MATLAB’s ode113, and for now I just wanted to check the advantage of Julia itself for a similar solver (the 100x improvement for the Lotka-Volterra benchmarks intrigued me)

I have no objections with sharing code if you’d like to play with it, though? (I was assuming most people here to be too busy to do that)

If you share it some other people might take a look. I would bookmark it to look like a week from now and see if no one else has. High order RK methods are just not stable and not a great idea for PDEs, so Vern7 would be one of the last methods I would check here. CVODE_BDF(linear_solver = :GMRES) might be a very easy way to take a chunk off, or ROCK4().

2 Likes

For those interested in the full code:

using LinearAlgebra, DifferentialEquations, SparseArrays, Statistics, Plots

# Time span and output grid for the solver.
tstart = 0.0
tstep = 1.0
tfinal = 100.0
tarray = tstart:tstep:tfinal

# Spatial grid: Nx × Ny points on [-50, 50]².
xvec = -50.0:0.5:50.0
yvec = -50.0:0.5:50.0
Nx = length(xvec); Ny = length(yvec); N = Nx * Ny
X = xvec' .* ones(Ny)
Y = ones(Nx)' .* yvec

# Physical constants.
ħ = 0.6591
c = 300.0
m0 = 511.0 * 1e6 / c^2

m = 1e-5 * m0
g = 1e-4
t0 = 7.0
l0 = sqrt(ħ * t0 / (2 * m))  # characteristic length scale

# Dimensionless grid coordinates.
xvectilde = xvec / l0; yvectilde = yvec / l0
gridsize = xvectilde[2] - xvectilde[1]
# Have to use a uniform grid for the derivative in the kinetic energy term.
@assert (yvectilde[2] - yvectilde[1]) == gridsize
Xtilde = xvectilde' .* ones(Ny)
Ytilde = ones(Nx)' .* yvectilde

V0orig = 2.0
Vtilde0 = V0orig * t0 / ħ
gtilde = g * 2 * m / ħ^2

# Nspots pump spots evenly spaced on a circle of radius `dist`.
dist = 10.0
Nspots = 4
L = zeros(Nspots, 2)
for ss in 1:Nspots
    L[ss, :] = dist * [cos(2 * pi / Nspots * ss + pi / 2), sin(2 * pi / Nspots * ss + pi / 2)]
end

Ltilde = L / l0
pumpwidth = 4
pumpwidthtilde = pumpwidth / l0
Pfun = zeros(Nx, Nx)
pumpidx = zeros(Int64, Nspots, 1)
pumpstrength = 1
for pp in 1:size(L, 1)
    # Gaussian pump profile centred on spot pp.
    # (The exponent can remain dimensionful (micron) without problems, I think.)
    thispump = pumpstrength *
               exp.(-((Xtilde .- Ltilde[pp, 1]) .^ 2 .+ (Ytilde .- Ltilde[pp, 2]) .^ 2) ./
                    (2 * pumpwidthtilde .^ 2))
    Pfun .+= thispump               # accumulate in place: no `global` rebinding needed
    pumpidx[pp] = argmax(vec(thispump))
end

# Grid index closest to the centroid of the pump spots.
middle = mean(L, dims = 1)
centreidx = argmin((middle[1] .- X[:]) .^ 2 + (middle[2] .- Y[:]) .^ 2)

# All spatially dependent coefficients are proportional to the pump profile.
V = Vtilde0 * Pfun
η = 0.1 * Pfun
σ = 0.3 * Pfun
α = Pfun

# In-place RHS of the complex Ginzburg-Landau equation on an Nx×Nx grid,
# with the complex field split into real part X = u[:,:,1] and imaginary
# part P = u[:,:,2].
#
# p3 : tuple (Nx, D, η, g, V, α, σ) where
#      Nx         — number of grid points per dimension
#      D          — 1/gridsize^2, the Laplacian prefactor
#      η, V, α, σ — real Nx×Nx coefficient arrays
#      g          — real scalar
# t  : time (unused; required by the DiffEq interface)
#
# Boundary handling: out-of-range neighbour indices are mirrored back inside
# (0 -> 2, Nx+1 -> Nx-1), which reproduces the `2*X[...]` ghost-node terms of
# the fully written-out Neumann stencil. Folding interior, edges and corners
# into a single loop replaces ~14 near-identical hand-unrolled loops while
# keeping the per-cell arithmetic the same.
function CGLfunrealdevect!(du, u, p3, t)
    Nx, D, η, g, V, α, σ = p3

    X = @view u[:, :, 1]
    P = @view u[:, :, 2]
    dX = @view du[:, :, 1]
    dP = @view du[:, :, 2]

    @inbounds for kk in 1:Nx, jj in 1:Nx
        # Mirror out-of-range neighbours (Neumann boundary condition).
        jm = jj == 1 ? 2 : jj - 1
        jp = jj == Nx ? Nx - 1 : jj + 1
        km = kk == 1 ? 2 : kk - 1
        kp = kk == Nx ? Nx - 1 : kk + 1

        x = X[jj, kk]
        pim = P[jj, kk]
        # Discrete 5-point Laplacians of the two field components.
        lapX = X[jm, kk] + X[jp, kk] + X[jj, km] + X[jj, kp] - 4 * x
        lapP = P[jm, kk] + P[jp, kk] + P[jj, km] + P[jj, kp] - 4 * pim
        amp2 = x^2 + pim^2  # local intensity |u|^2

        dX[jj, kk] = D * η[jj, kk] * lapX - D * lapP +
                     α[jj, kk] * x + V[jj, kk] * pim +
                     amp2 * (-σ[jj, kk] * x + g * pim)
        dP[jj, kk] = D * η[jj, kk] * lapP + D * lapX +
                     α[jj, kk] * pim - V[jj, kk] * x +
                     amp2 * (-σ[jj, kk] * pim - g * x)
    end
    return nothing
end

# Initial condition: small real-valued noise for both field components,
# stored as an Nx×Nx×2 array (real and imaginary parts stacked).
u0 = 1e-3 * randn(Nx, Nx, 2)

D = 1 / gridsize^2  # Laplacian prefactor

# Keep p3 a Tuple, NOT a Vector: a heterogeneous Vector{Any} makes the RHS
# type-unstable and allocates on every call.
p3 = (Nx, D, η, g, V, α, σ)

# Use a homogeneous Float64 tspan; the original (0, 100.) mixed Int and
# Float64, which can introduce type instability in the integrator.
prob = ODEProblem(CGLfunrealdevect!, u0, (0.0, 100.0), p3)
@benchmark sol = solve(prob, Vern7(), saveat = tarray);


I did a quick check with ROCK4() as well and CVODE_BDF(linear_solver = :GMRES) as suggested, but they were not super good: memory does increase with time and they are slower, especially for short evolution times. For a time evolution of 100, ROCK4 seems to have caught up with Vern7 in computation time (but at much larger memory)

ROCK4, (0,100.)

BenchmarkTools.Trial: 
  memory estimate:  101.78 GiB
  allocs estimate:  339782
  --------------
  minimum time:     157.428 s (1.22% GC)
  median time:      157.428 s (1.22% GC)
  mean time:        157.428 s (1.22% GC)
  maximum time:     157.428 s (1.22% GC)
  --------------
  samples:          1
  evals/sample:     1

CVODE_BDF(linear_solver = :GMRES) ,(0,10.)

BenchmarkTools.Trial: 
  memory estimate:  42.00 MiB
  allocs estimate:  729536
  --------------
  minimum time:     127.524 s (0.00% GC)
  median time:      127.524 s (0.00% GC)
  mean time:        127.524 s (0.00% GC)
  maximum time:     127.524 s (0.00% GC)
  --------------
  samples:          1
  evals/sample:     1
1 Like