# Taking advantage of Apple M1?

Can you give me an example of multi-threading? It is my first time with more than one core hahaha

Hi Romain,

This observation is a bit old (July 2022), and I was not able to reproduce the issue after updating my environment (Julia 1.8.5, Ventura, and up-to-date Julia packages). Makie OpenGL animations run smoothly now.

My issue concerning Metal.jl (slow broadcast copy in 2D · Issue #41 · JuliaGPU/Metal.jl · GitHub) remains, though.

1 Like

Well, multi-threading is not a trivial subject, and I guess you will have to study the documentation (Multi-Threading · The Julia Language), especially the section on data races (Multi-Threading · The Julia Language).
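
To make the data-race warning concrete, here is a minimal sketch (my own illustration, not from the manual): several tasks doing an unsynchronized read-modify-write on a shared variable produce a wrong, run-dependent result, while an `Atomic` accumulator stays correct.

```julia
using Base.Threads

# Racy: unsynchronized read-modify-write on a shared variable.
function racy_sum(n)
    s = 0
    @threads for i in 1:n
        s += i  # data race: with >1 thread, updates get lost
    end
    return s
end

# Safe: atomic updates serialize the read-modify-write.
function atomic_sum(n)
    acc = Atomic{Int}(0)
    @threads for i in 1:n
        atomic_add!(acc, i)
    end
    return acc[]
end

atomic_sum(10^5) == sum(1:10^5)  # true regardless of thread count
```

An atomic per iteration is slow; in real code you would give each task its own partial sum and reduce at the end, but the atomic version is the shortest correct fix.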

Here is a trivial example though.

``````
using BenchmarkTools

function sequential_exp!(y,x)
    for i in eachindex(x)
        @inbounds y[i]=exp(x[i])
    end
end

function threaded_exp!(y,x)
    Threads.@threads for i in eachindex(x)
        @inbounds y[i]=exp(x[i])
    end
end

n=500000
x=rand(n)
y=zeros(n)

tseq = @belapsed sequential_exp!(y,x)
tmt = @belapsed threaded_exp!(y,x)
SpUp = tseq/tmt
@show tseq,tmt,SpUp
``````

I obtain (M1 Max):

``````
julia> include("testmt.jl")
(tseq, tmt, SpUp) = (0.001626458, 0.000223959, 7.262302475006585)
(0.001626458, 0.000223959, 7.262302475006585)
``````
1 Like

Okay, now I can’t resist. Here is what I get with 8 threads on my 14nm Intel desktop:

``````
julia> using BenchmarkTools

julia> n=500000;

julia> x=rand(n);

julia> y=zeros(n);

julia> function threaded_exp!(y,x)
           Threads.@threads for i in eachindex(x)
               @inbounds y[i]=@inline exp(x[i])
           end
       end
threaded_exp! (generic function with 1 method)

julia> function sequential_exp!(y,x)
           for i in eachindex(x)
               @inbounds y[i]=@inline exp(x[i])
           end
       end
sequential_exp! (generic function with 1 method)

julia> tseq = @belapsed sequential_exp!(y,x)
0.000430829

julia> tmt = @belapsed threaded_exp!(y,x)
6.9831e-5

julia> Threads.nthreads()
8

julia> SpUp = tseq/tmt;

julia> @show tseq,tmt,SpUp;
(tseq, tmt, SpUp) = (0.000430829, 6.9831e-5, 6.1695951654709225)

julia> versioninfo()
Julia Version 1.10.0-DEV.410
Commit 62c1137e38* (2023-01-21 04:16 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: 36 × Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
``````

The most important bit:

``````
julia> @show tseq,tmt,SpUp;
(tseq, tmt, SpUp) = (0.000430829, 6.9831e-5, 6.1695951654709225)
``````

You can try rerunning with the `@inline`s like I have above.
Sure, I only got a 6.2x speedup, but

``````
julia> (0.001626458, 0.000223959) ./ (0.000430829, 6.9831e-5)
(3.775182264889318, 3.2071572797181767)
``````

far from being smoked, my old Intel CPU was over 3x faster on both 1 thread and 8 threads.

To make the M1 look good, you need benchmarks that either aren’t SIMD-friendly or are memory-bandwidth constrained.
To be fair, most code probably falls into one of those two categories, including your benchmark if we remove the `@inline`.
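
A quick way to see the distinction (my sketch, not Chris’s code): the `exp` kernel does real arithmetic per element, so SIMD width matters, while a plain copy moves the same bytes with almost no FLOPs and is limited purely by memory bandwidth.

```julia
using BenchmarkTools

n = 500_000
x = rand(n); y = zeros(n)

# Compute-bound and SIMD-friendly: real arithmetic on every element.
function expkernel!(y, x)
    @inbounds for i in eachindex(x)
        y[i] = exp(x[i])
    end
end

# Memory-bandwidth-bound: same traffic, almost no arithmetic, so wider
# SIMD units buy little; memory speed decides the winner.
function copykernel!(y, x)
    @inbounds for i in eachindex(x)
        y[i] = x[i]
    end
end

texp  = @belapsed expkernel!($y, $x)
tcopy = @belapsed copykernel!($y, $x)
```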

4 Likes

Thanks Chris, this is very interesting!
I am very surprised that the `@inline` makes such a difference here!

M1 Max (unplugged):

``````
Threads.nthreads() = 8
(tseq, tmt, SpUp) = (0.000936084, 0.000131709, 7.10721363004806)
(0.000936084, 0.000131709, 7.10721363004806)
``````

I agree that this example was not chosen to emphasize the M1’s strengths; it is just the simplest multi-threading example (showing a convincing speedup) I could think of.
I will try to find a better one.

1 Like

Here is a memory-bound example (wave propagation with a low-order finite-difference scheme) with a Makie (GLMakie) animation running during the computations.

SingleSpring.jl
``````
using GLMakie

Base.@kwdef struct SpringParam
    ls::Float64 # Spring length
    ms::Float64 # Mass
    ks::Float64 # Spring strength
    ns::Int     # Mass number
end

Base.@kwdef struct InitParam
    λ::Float64      # Wavelength of the initial deformation
    shift::Float64  # Maximal initial node displacement
    pos::Float64    # Central position of the node displacement
end

Base.@kwdef struct AnimParam
    δt::Float64      # Time step for the dynamic simulation
    nδt::Int         # Number of timesteps
    nδtperframe::Int # Number of timesteps per animation frame
end

steady_position(i, ls) = (i - 1) * ls

function initial_position(i,j,ls,λ,shift,posx,posy)
    xs=steady_position(i,ls)
    ys=steady_position(j,ls)
    dx=xs-posx
    dy=ys-posy
    dr=sqrt(dx^2+dy^2)
    ds=(-1/λ^2)*exp(-(dr/λ)^2/2)*shift
    xs+dx*ds,ys+dy*ds
end

function display_perf(td,xc,nδt)
    nflops=20 # Manual count of floating point ops
    nRW=12    # Manual count of array reads and writes
    T=eltype(xc) # i.e. Float64
    floatsize_inbytes=sizeof(T) # Float64 -> 8 bytes
    nbytes=nRW*floatsize_inbytes # Float64 -> 96 bytes

    mupns=length(xc)*nδt/(td*1.e9) # mass pos updates per ns
    println("$(round((mupns*nflops),sigdigits=3))\t GFLOPS")
    println("$(round((mupns*nbytes),sigdigits=3))\t GB/s")
end

# V is the type of the returned arrays (types are first-class citizens!)
function initialize_arrays(sp,ip,V)
    (;ls,ns) = sp
    (;λ,shift,pos) = ip

    # 2D array comprehensions
    #  -> cxs,cys,cxc,cyc are 2D arrays of size (ns,ns)
    cxs=[steady_position(i,ls) for i ∈ 1:ns, _ ∈ 1:ns]
    cys=[steady_position(j,ls) for _ ∈ 1:ns, j ∈ 1:ns]
    cxc=[initial_position(i,j,ls,λ,shift,pos,pos)[1] for i ∈ 1:ns, j ∈ 1:ns]
    cyc=[initial_position(i,j,ls,λ,shift,pos,pos)[2] for i ∈ 1:ns, j ∈ 1:ns]

    # Broadcast of the V ctor, equivalent to xs=V(cxs);ys=V(cys);xc=V(cxc);yc=V(cyc)
    xs,ys,xc,yc=V.((cxs,cys,cxc,cyc))
    # Assume zero initial velocities: xp(revious)=xc(urrent)
    fx,xt,xp=zero(xc),zero(xc),copy(xc)
    fy,yt,yp=zero(yc),zero(yc),copy(yc)

    xs,xc,xp,xt,fx,ys,yc,yp,yt,fy
end

function update_position!(xt::Array{T,2},xc::Array{T,2},xp::Array{T,2},
                          fx::Array{T,2},δt,ms) where {T}
    nsx,nsy=size(xc)
    coef = T(δt^2 / ms)
    t2 = T(2)
    Threads.@threads for j ∈ 1:nsy
        @inbounds for i ∈ 1:nsx
            xt[i,j] = t2*xc[i,j]-xp[i,j]+fx[i,j]*coef
        end
    end
end

function update_force!(fx::Array{T,2},xc::Array{T,2},ks) where {T}
    nsx,nsy=size(xc)
    mtks=T(-ks)
    t4=T(4)
    Threads.@threads for j ∈ 2:nsy-1
        @inbounds for i ∈ 2:nsx-1
            fx[i,j] = mtks*(t4*xc[i,j]
                            -xc[i,j-1]
                            -xc[i-1,j]
                            -xc[i+1,j]
                            -xc[i,j+1])
        end
    end
end

function density_shift!(dc,xc,yc,ls)
    ns=size(xc,1) # assuming a square array
    ds=1/(2ls)^2  # steady-state density, subtracted so the plot is centered on zero
    r0,r1,r2=1:ns-2,2:ns-1,3:ns # UnitRanges
    @views @. dc[r1,r1] = 1/((xc[r2,r1] - xc[r0,r1])*(yc[r1,r2] - yc[r1,r0])) - ds
end

function setup_scene(xaxis,dc,nf)
    # scene = Scene(resolution = (1500, 1500))
    dcnode = Observable(dc)

    xM=maximum(xaxis)
    dM=maximum(dc)
    fig = Figure(resolution = (2000, 1400), fontsize = 22)
    ax = Axis3(fig[1, 1], limits=(0,xM,0,xM,-dM/8,dM/8))

    # scale!(ax, 1, 1, 0.1*xM/dM)
    # translate_cam!(scene,(10,10,2))
    # update_cam!(scene, Vec3f0(0, 0, 5), Vec3f0(0.01, 0.01, 0))
    # rotate_cam!(scene,(π/4 ,-π/2 ,0))
    # αstep=-(π/4)/(nf÷2)
    # lift(d->rotate_cam!(scene,(0 ,αstep ,0)),dcnode)

    fig,dcnode
end

# Triscale colors ;)

function advance_nδtpf(xc, xp, xt, fx, sp, ap)
    (;ms, ks) = sp
    (;δt, nδtperframe) = ap

    for _ ∈ 1:nδtperframe # Anonymous iter variable (_)
        update_force!(fx, xc, ks)
        update_position!(xt, xc, xp, fx, δt, ms)
        xc, xp, xt = xt, xc, xp # Julia's swap
    end
    xc, xp, xt
end

function animate_makie(sp,ip,ap,V)
    (;δt,nδt,nδtperframe) = ap
    xs,xc,xp,xt,fx,ys,yc,yp,yt,fy=initialize_arrays(sp,ip,V)

    dc=zero(xc)
    cdc=Array(xc)
    density_shift!(dc,xc,yc,sp.ls)
    @. cdc = dc
    xaxis=Array(xs[:,1])
    yaxis=Array(ys[1,:])
    nf=nδt÷nδtperframe

    scene,dcnode=setup_scene(xaxis,cdc,nf)
    display(scene)

    t=0.0
    tdynamic=0.0
    for _ ∈ 1:nf
        tdynamic += @elapsed begin
            xc,xp,xt = advance_nδtpf(xc, xp, xt, fx, sp, ap)
            yc,yp,yt = advance_nδtpf(yc, yp, yt, fy, sp, ap)
        end
        density_shift!(dc,xc,yc,sp.ls)
        dcnode[] = dc
        sleep(0.01)
        t+=nδtperframe*δt
    end

    display_perf(tdynamic,xc,nδt) # Display GFLOPS and GB/s
end

function go()
    sp=SpringParam(ls=0.1,ms=1,ks=2.5,ns=1000)
    ip=InitParam(λ=3.0,shift=0.1,pos=sp.ls*sp.ns/2)
    ap=AnimParam(δt=0.2,nδt=5000,nδtperframe=20)

    V=Array{Float64,2}

    @time animate_makie(sp,ip,ap,V)
end
go()
``````

I obtain (M1 Max):

``````
julia> include("SingleSpring.jl")
21.1     GFLOPS
101.0    GB/s
8.506742 seconds (1.46 M allocations: 1.164 GiB, 0.86% gc time, 1.18% compilation time)
``````

with a final image:

8 Likes

Can we start a GoFundMe to get @Elrod an M2 MacBook Pro so he can use some of his wizardry to make it sing?

5 Likes

Maybe it’s worth mentioning that this:

``````
julia> @show tseq,tmt,SpUp;
(tseq, tmt, SpUp) = (0.000943125, 0.000267958, 3.5196747251434926)
``````

was run on an M1 without a fan, chilling at around 27 °C (7 °C above room temperature) right now with two 4K displays (via DisplayLink), 600 processes, and 3500 active threads, with a load average of around 4.

…but yes, roasting is something else

2 Likes

In this case, Chris’s wizard tricks may not improve the M1’s results.

I guess that the remaining factor of 2 in favor of Chris’s x86 chip is related to the AVX (256-bit-wide) instructions, which (AFAIK) have no counterpart on ARM (restricted to 128-bit-wide SIMD instructions).

This is specific to purely SIMD-friendly, CPU-bound problems like my first example.

The second example (waves) is memory bound, and I would be interested to see x86 results on that one too.

1 Like

Here are results from my Windows 11 machine:

``````
julia> include("SingleSpring.jl")
4.57     GFLOPS
22.0     GB/s
27.811275 seconds (1.47 M allocations: 1.182 GiB, 1.06% gc time, 0.68% compilation time: 2% of which was recompilation)

julia> versioninfo()
Julia Version 1.8.5
Commit 17cfb8e65e (2023-01-08 06:45 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: 8 × Intel(R) Core(TM) i7-9700 CPU @ 3.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, skylake)
Threads: 8 on 8 virtual cores
Environment:
JULIA_EDITOR = code
``````
3 Likes

Thanks!

In this case you see the benefit of the M1’s architecture, with its high-speed unified memory: the MacBook Pro is 5 times faster.

2 Likes

Impressive! But does the timing also depend on the relative speed of the graphics cards in the two machines?

Nope. Pure CPU timings.

The graphics part (OpenGL rendering) is not measured by the GFLOPS and GB/s metrics here.

Note that the final time (in seconds) should not be taken too seriously, since I included a `sleep(0.01)` to limit the animation frame rate.

1 Like

I am surprised to see that my Mac with an i5 Ice Lake CPU beats the M1 by a hefty margin. It is more like an M1 Pro. I guess they never upgraded it to Tiger Lake because the difference vs. Apple silicon would have been quite small.

``````
julia> tseq = @belapsed sequential_exp!(y,x)
0.000633398

julia> tmt = @belapsed threaded_exp!(y,x)
0.000176527

julia> Threads.nthreads()
8

julia> SpUp = tseq/tmt;

julia> @show tseq,tmt,SpUp;
(tseq, tmt, SpUp) = (0.000633398, 0.000176527, 3.58810833470234)

julia> versioninfo()
Julia Version 1.8.0
Commit 5544a0fab76 (2022-08-17 13:38 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin21.4.0)
CPU: 8 × Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, icelake-client)
Threads: 8 on 8 virtual cores
Environment:
``````

I get about the same performance on 8x 14nm Intel cores.

``````
julia> include("/home/chriselrod/Documents/progwork/julia/benchmarks/SingleSpring.jl");
21.3     GFLOPS
102.0    GB/s
8.639054 seconds (1.32 M allocations: 1.166 GiB, 0.95% gc time, 4.84% compilation time)

julia> go()
21.8     GFLOPS
105.0    GB/s
8.459699 seconds (1.29 M allocations: 1.164 GiB, 0.30% gc time)

julia> versioninfo()
Julia Version 1.9.0-beta4
Commit b75ddb787f* (2023-02-07 21:53 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: 36 × Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
Threads: 8 on 36 virtual cores
``````

However, it is consuming far more power while doing it.
I do have a 7980XE at work that’s actually plugged into a monitor to measure wattage.
It’ll idle at >95 watts.

This CPU has AVX512 (as does Ice Lake), which is why they perform so well on SIMD workloads.

1 Like

We migrated to the M1 architecture for many analysis and simulation tasks. Our problem is sequential (simulation of a satellite attitude control subsystem), but we could parallelize some parts.

Just using the M1 provided a huge gain compared with our previous Intel processor, something like 3x faster. I have no idea why; I just ran the same code.

Now we are taking advantage of unified memory to allow some computations on the GPU. Previously, the cost of copying to GPU memory was so high that the gain was negligible. Now, since we can use the same memory, I was able to achieve more speed! This unified memory architecture is just amazing!
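
For readers curious what this looks like, here is a minimal Metal.jl sketch (my illustration, assuming the default shared storage mode; not the poster’s actual code):

```julia
using Metal  # Apple-silicon GPU programming; requires macOS on an M-series chip

# Metal GPUs prefer Float32. On Apple silicon the CPU and GPU share the
# same physical memory, so creating an MtlArray does not copy across a bus.
x = MtlArray(rand(Float32, 10^6))
y = similar(x)

y .= exp.(x)       # broadcast compiles to a Metal GPU kernel
result = Array(y)  # bring the results back as a plain CPU Array
```

The point of the post above is exactly that the `MtlArray`/`Array` round trip is cheap on this architecture, so even small kernels can come out ahead.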

1 Like

M1 systems top out at 64 GB of memory

1 Like

128 GB with the M1 Ultra, and 96 GB with the M2s. But this is clearly a drawback of this integrated SoC architecture.

And the Apple M2 Max results (30% faster compared to the M1 Max):

``````
julia> include("SingleSpring.jl")
27.5     GFLOPS
132.0    GB/s
7.324295 seconds (1.40 M allocations: 1.162 GiB, 0.77% gc time, 1.46% compilation time)

julia> versioninfo()
Julia Version 1.8.5
Commit 17cfb8e65e* (2023-01-08 06:45 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin22.1.0)
CPU: 12 × Apple M2 Max
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, apple-m1)
Threads: 8 on 8 virtual cores
Environment:
JULIA_EDITOR = code
``````

But what is interesting is that it is significantly faster with Julia 1.9.0-beta4:

``````
julia> include("SingleSpring.jl")
33.5	 GFLOPS
161.0	 GB/s
6.690864 seconds (1.30 M allocations: 1.165 GiB, 0.87% gc time, 4.50% compilation time)

julia> versioninfo()
Julia Version 1.9.0-beta4
Commit b75ddb787ff (2023-02-07 21:53 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin21.5.0)
CPU: 12 × Apple M2 Max
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
Threads: 8 on 8 virtual cores
Environment:
``````
2 Likes

And the `exp` benchmark

``````
using BenchmarkTools

n=500000;
x=rand(n);
y=zeros(n);

function threaded_exp!(y,x)
    Threads.@threads for i in eachindex(x)
        @inbounds y[i]=@inline exp(x[i])
    end
end

function sequential_exp!(y,x)
    for i in eachindex(x)
        @inbounds y[i]=@inline exp(x[i])
    end
end

tseq = @belapsed sequential_exp!(y,x)
tmt = @belapsed threaded_exp!(y,x)
SpUp = tseq/tmt

@show tseq,tmt,SpUp;
``````

gives me:

``````
(tseq, tmt, SpUp) = (0.000863083, 0.000123042, 7.014539750654248)
(0.000863083, 0.000123042, 7.014539750654248)

julia> versioninfo()
Julia Version 1.8.5
Commit 17cfb8e65e* (2023-01-08 06:45 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin22.1.0)
CPU: 12 × Apple M2 Max
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-13.0.1 (ORCJIT, apple-m1)
Threads: 8 on 8 virtual cores
``````
1 Like