Use static arrays, and take a close look to the allocations in inner loops. Here is an example where I have replaced some of your inner matrices with static matrices:
Result:
julia> include("./deb.jl")
0.182349 seconds (60 allocations: 15.426 MiB)
(vs ~1s and 3GB allocs from the original code)
using FastGaussQuadrature, LinearAlgebra, BenchmarkTools, TimerOutputs
using StaticArrays, Test, Setfield
function filamentstokes(nx,ng,A,t)
# Specify the number of nodes
# nx = 501
# Note that the number of elements = nx-1
# Specify the odd-numbered mo20
# A = [1, 0.01, -0.01, 0.001, - 0.001]
# How many points needed for integration using Gauss Quadrature?
# ng = 20
# Points and weights "using FastGaussQuadrature"
xg, wg = gausslegendre(ng)
# time
# t = 0
# Generate a sin curve
# x and y are nodes while x_m and y_m are mid points of an element
x = LinRange(-π,π,nx)
nm = 1:1:length(A)
y = (sin.((x.-t).*(2nm'.-1)))*A #(Only odd numbered modes)
# Mid-points (collocation points) and tangents at the mid points.
xm = 0.5*(x[2:nx] .+ x[1:nx-1])
ym = 0.5*(y[2:nx] .+ y[1:nx-1])
ls = sqrt.((x[2:nx] .- x[1:nx-1]).^2 .+ (y[2:nx] .- y[1:nx-1]).^2) # length of each element
tmx = (x[2:nx] .- x[1:nx-1])./ls
tmy = (y[2:nx] .- y[1:nx-1])./ls
# Arc length of node points
s = zeros(nx,1)
for i = 1:nx-1
s[i+1] = s[i] + ls[i]
end
# Arc length of mid points
sm = 0.5*(s[2:nx] .+ s[1:nx-1])
# Filament velocity at mid points (forcing for the equations)
vm = (-(2nm'.-1).*cos.((xm.-t).*(2nm'.-1)))*A
ρ = 0.001
c = 2*log(ρ/s[nx]) + 1
RM, rhs = threenestedloops(xm,ym,tmx,tmy,sm,vm,ls,s,c,x,y,xg,wg,ng,nx)
fh = RM\rhs
# println("The x-velocity is ", fh[2*(nx-1)+1])
#display(plot(x,y, aspect_ratio=:equal))
#display(quiver!(xm,ym,quiver=(tmx,tmy), aspect_ratio=:equal))
end
function threenestedloops(xm,ym,tmx,tmy,sm,vm,ls,s,c,x,y,xg,wg,ng,nx)
# Initialize the matrix and right hand side
RM = zeros(2*(nx-1) + 1,2*(nx-1) + 1)
rhs = zeros(2*(nx-1) + 1,1)
#Threads.@threads for j = 1:nx-1 # use Threads if needed
for j = 1:nx-1 # looping over columns first
j1 = (j-1)*2 + 1
j2 = (j-1)*2 + 2
tcx = tmx[j]
tcy = tmy[j]
# Local operator terms( \Lambda [f_h] )
RM[j1,j1] += -(1*(2-c) - tcx*tcx*(c+2))/8π
RM[j1,j2] += -(0*(2-c) - tcx*tcy*(c+2))/8π
RM[j2,j1] += -(0*(2-c) - tcy*tcx*(c+2))/8π
RM[j2,j2] += -(1*(2-c) - tcy*tcy*(c+2))/8π
rhs[j1] = 0
rhs[j2] = vm[j]
RM[2*(nx-1) + 1,j1] = ls[j]
RM[j1,2*(nx-1) + 1] = 1
# trying the @view macro
#RMj1 = @view RM[:,j1] # just a vector with j1 column
#RMj2 = @view RM[:,j2] # just a vector with j2 column
for i = 1:nx-1
i1 = (i-1)*2 + 1
i2 = (i-1)*2 + 2
xc = xm[i]
yc = ym[i]
tcx = tmx[i]
tcy = tmy[i]
sc = sm[i]
# Single element k-Loop Integration using quadratures
GE,GEself = elementintegrate(xc,yc,tcx,tcy,sc,x[j],y[j],x[j+1],y[j+1],s[j],s[j+1],xg,wg,ng)
# Non-local operator terms( K [f_h] )
#RMj1[i1] += GE[1,1]/8π
#RMj2[i1] += GE[1,2]/8π
#RMj1[i2] += GE[2,1]/8π
#RMj2[i2] += GE[2,2]/8π
RM[i1,j1] += GE[1,1]/8π
RM[i1,j2] += GE[1,2]/8π
RM[i2,j1] += GE[2,1]/8π
RM[i2,j2] += GE[2,2]/8π
RM[i1,i1] += -GEself[1,1]/8π
RM[i1,i2] += -GEself[1,2]/8π
RM[i2,i1] += -GEself[2,1]/8π
RM[i2,i2] += -GEself[2,2]/8π
end
end
return RM, rhs
end
function elementintegrate(xc,yc,tcx,tcy,sc,x1,y1,x2,y2,s1,s2,xg,wg,ng)
# x1,y1,x2,y2 are coordinates of element j with nodes at j and j+1
# xc, yc are the coordinates of collocation point
GE = zeros(SMatrix{3,3,Float64}) #
GEself = zeros(SMatrix{3,3,Float64}) #
d = 0.001 # regularisation parameter
for k = 1:ng
xi = xg[k]
xint= 0.5*( (x2 + x1) + (x2 - x1)*xi)
yint = 0.5*( (y2 + y1) + (y2 - y1)*xi)
sint = 0.5*( (s2 + s1) + (s2 - s1)*xi)
hl = 0.5*sqrt( (x2-x1)^2 + (y2-y1)^2)
# Call Green's function for Stokes equation
GEk = sgf_3d_fs(xint,yint,0,xc,yc,0) # two dimensional problem: z=0
GE += GEk*hl*wg[k]
ds = abs(sc-sint)
dss = sqrt(ds^2 + d^2)
@set! GEself[1,1] += ((1.0 + tcx*tcx)/dss)*hl*wg[k]
@set! GEself[1,2] += ((0.0 + tcx*tcy)/dss)*hl*wg[k]
@set! GEself[2,1] += ((0.0 + tcy*tcx)/dss)*hl*wg[k]
@set! GEself[2,2] += ((1.0 + tcy*tcy)/dss)*hl*wg[k]
end
return GE, GEself
end
function sgf_3d_fs(x,y,z,x0,y0,z0)
dx = x-x0
dy = y-y0
dz = z-z0
dxx = dx^2
dxy = dx*dy
dxz = dx*dz
dyy = dy^2
dyz = dy*dz
dzz = dz^2
r = sqrt(dxx+dyy+dzz);
r3 = r*r*r
ri = 1.0/r
ri3 = 1.0/r3
g11 = ri + dxx*ri3
g12 = 0.0 + dxy*ri3
g13 = 0.0 + dxz*ri3
g21 = g12
g22 = 1.0 + dyy*ri3
g23 = 0.0 + dyz*ri3
g31 = g13
g32 = g23
g33 = 1.0 + dzz*ri3
GEk = @SMatrix [ g11 g12 g13
g21 g22 g23
g31 g32 g33 ]
return GEk
end
# save original result to check
#result = filamentstokes(501,20,[1, 0.01, -0.01, 0.001, - 0.001],0)
@test result ≈ filamentstokes(501,20,[1, 0.01, -0.01, 0.001, - 0.001],0)
@time filamentstokes(501,20,[1, 0.01, -0.01, 0.001, - 0.001],0)
nothing
Having your inner loops completely allocation free is the main thing that you need to pursue to make the code fast. Take a look at these notes on how to track the allocations.
Using static arrays will avoid allocations of those small vectors or matrices in the inner loops (because, being static, the compiler can optimize them to be used only on the stack memory or the processor cache). (these other notes may help as well, maybe, but certainly better texts exist out there).