Taking advantage of Apple M1?

Here is a memory bound example (wave propag with low order F.D. scheme) with an Makie (GLMakie) animation running during the computations.

SingleSpring.jl
using GLMakie
using PlotUtils #for colorant and color gradient (cgrad)

Base.@kwdef struct SpringParam
    ls::Float64 # Spring length
    ms::Float64 # Mass
    ks::Float64 # Spring Strength
    ns::Int     # Mass number
end

Base.@kwdef struct InitParam
    λ::Float64      # Wavelength of the initial deformation
    shift::Float64  # Maximal initial node displacement
    pos::Float64    # Central position of the node displacement
end

Base.@kwdef struct AnimParam
    δt::Float64      # Time step for the dynamic simulation
    nδt::Int         # Number of timesteps
    nδtperframe::Int # Number of timesteps per animation frame
end

steady_position(i, ls) = (i - 1) * ls
function initial_position(i,j,ls,λ,shift,posx,posy)
    xs=steady_position(i,ls)
    ys=steady_position(j,ls)
    dx=xs-posx 
    dy=ys-posy
    dr=sqrt(dx^2+dy^2)
    ds=(-1/λ^2)*exp(-(dr/λ)^2/2)*shift
    xs+dx*ds,ys+dy*ds
end 

function display_perf(td,xc,nδt)
    nflops=20 # Manual count floating point ops
    nRW=12    # Manual count Array Reads and Writes
    T=eltype(xc) # i.e. Float64
    floatsize_inbytes=sizeof(T) #Float64 -> 8 Bytes
    nbytes=nRW*floatsize_inbytes # Float64 -> 96 Bytes
    
    mupns=length(xc)*nδt/(td*1.e9) # mass pos updates per ns
    println("$(round((mupns*nflops),sigdigits=3))\t GFLOPS")
    println("$(round((mupns*nbytes),sigdigits=3))\t GB/s")
end

#V is the type of the returned arrays (type are first class citizens !)
function initialize_arrays(sp,ip,V) 
    (;ls,ns) = sp
    (;λ,shift,pos) = ip

    #2D Array comprehensions 
    #  -> cxs,xys,cxc,cyc are 2D arrays of size (ns,ns)
    cxs=[steady_position(i,ls) for i ∈ 1:ns, _ ∈ 1:ns] 
    cys=[steady_position(j,ls) for _ ∈ 1:ns, j ∈ 1:ns]
    cxc=[initial_position(i,j,ls,λ,shift,pos,pos)[1] for i ∈ 1:ns, j ∈ 1:ns]
    cyc=[initial_position(i,j,ls,λ,shift,pos,pos)[2] for i ∈ 1:ns, j ∈ 1:ns]

    #Broadcast of V Ctor equivalent to xs=V(cxs);ys=V(cys);xc=V(cxc);yc=V(cyc)
    xs,ys,xc,yc=V.((cxs,cys,cxc,cyc))
    #Assume zero initial velocities : xp(revious)=xc(urrent)
    fx,xt,xp=zero(xc),zero(xc),copy(xc)
    fy,yt,yp=zero(yc),zero(yc),copy(yc)

    xs,xc,xp,xt,fx,ys,yc,yp,yt,fy
end

function update_position!(xt::Array{T,2},xc::Array{T,2},xp::Array{T,2},
    fx::Array{T,2},δt, ms) where {T}

    nsx,nsy=size(xc)
    coef = T(δt^2 / ms)
    t2 = T(2)
    Threads.@threads for j ∈ 1:nsy
        @inbounds for i ∈ 1:nsx
            xt[i,j] = t2*xc[i,j]-xp[i,j]+fx[i,j]*coef
        end
    end
end

function update_force!(fx::Array{T,2},xc::Array{T,2},ks) where {T}
    nsx,nsy=size(xc)
    mtks=T(-ks)
    t4=T(4)
    Threads.@threads for j ∈ 2:nsy-1
        @inbounds for i ∈ 2:nsx-1
            fx[i,j] = mtks*(t4*xc[i,j]
            -xc[i,j-1]
            -xc[i-1,j]
            -xc[i+1,j]
            -xc[i,j+1])
        end
    end
end

function density_shift!(dc,xc,yc,ls)
    ns=size(xc,1) #assuming square array
    r0,r1,r2=1:ns-2,2:ns-1,3:ns # UnitRange
    ds=1/(4ls^2)  #steady density
    @views @. dc[r1,r1] = 1/((xc[r2,r1] - xc[r0,r1])*(yc[r1,r2] - yc[r1,r0])) - ds
end

function setup_scene(xaxis,dc,nf)
    # scene = Scene(resolution = (1500, 1500))
    dcnode = Observable(dc)

    xM=maximum(xaxis)

    dM=maximum(dc)
    fig = Figure(resolution = (2000, 1400), fontsize = 22)

    ax = Axis3(fig[1, 1], limits=(0,xM,0,xM,-dM/8,dM/8))

    surface!(ax,xaxis,xaxis,lift(d->d,dcnode),colormap=corporate_gradient(),colorrange = (-0.1,0.1))
    # scale!(ax, 1, 1, 0.1*xM/dM)

    #   translate_cam!(scene,(10,10,2))
    #     update_cam!(scene, Vec3f0(0, 0, 5), Vec3f0(0.01, 0.01, 0))
    #     rotate_cam!(scene,(π/4 ,-π/2 ,0))
    #     αstep=-(π/4)/(nf÷2)
    #     lift(d->rotate_cam!(scene,(0 ,αstep ,0)),dcnode)



    fig,dcnode
end

#Triscale colors ;)
corporate_gradient() = cgrad([colorant"#177272",colorant"#fafafa",colorant"#ee6f40"])

function advance_nδtpf(xc, xp, xt, fx, sp, ap)
    (;ms, ks) = sp
    (;δt, nδtperframe) = ap

    for _ ∈ 1:nδtperframe # Anonymous iter variable (_)
        update_force!(fx, xc, ks)
        update_position!(xt, xc, xp, fx, δt, ms)
        xc, xp, xt = xt, xc, xp # Julia's swap
    end
    xc, xp, xt
end

function animate_makie(sp,ip,ap,V)
    (;δt,nδt,nδtperframe) = ap
    xs,xc,xp,xt,fx,ys,yc,yp,yt,fy=initialize_arrays(sp,ip,V)

    dc=zero(xc)
    cdc=Array(xc)
    density_shift!(dc,xc,yc,sp.ls)
    @. cdc = dc 
    xaxis=Array(xs[:,1])
    yaxis=Array(ys[1,:])
    nf=nδt÷nδtperframe

    scene,dcnode=setup_scene(xaxis,cdc,nf)
    display(scene)
    
    t=0.0
    tdynamic=0.0
    for _ ∈ 1:nf
        tdynamic+= @elapsed begin
            xc,xp,xt=advance_nδtpf(xc,xp,xt,fx,sp,ap)
            yc,yp,yt=advance_nδtpf(yc,yp,yt,fy,sp,ap)
        end
        density_shift!(dc,xc,yc,sp.ls)
        dcnode[] = dc
        sleep(0.01)
        t+=nδtperframe*δt
    end

    display_perf(tdynamic,xc,nδt) #Display GFLOPS and GB/s
end

function go()
    sp=SpringParam(ls=0.1,ms=1,ks=2.5,ns=1000)
    ip=InitParam(λ=3.0,shift=0.1,pos=sp.ls*sp.ns/2)
    ap=AnimParam(δt=0.2,nδt=5000,nδtperframe=20)

    V=Array{Float64,2}

    @time animate_makie(sp,ip,ap,V)
end
go()

I obtain (M1 Max):

julia> include("SingleSpring.jl")
21.1     GFLOPS
101.0    GB/s
  8.506742 seconds (1.46 M allocations: 1.164 GiB, 0.86% gc time, 1.18% compilation time)

with a final image :

8 Likes