Tutorials or examples on multithreading and distributed processing for numerical PDEs

Thank you very much for the prompt response. I have written a minimal boundary element method code for a 2D sine wave. It solves the Stokes equation using boundary integral formulation.

Are you saying I will not benefit from parallelising the for-loops or storing the matrix RM in different processors when it gets too large, like (40,000 x 40,000) or so?

julia> include("filamentstokesserial.jl"); @btime filamentefficiency(2^8,20,[1.00, -0.01, 0.01, 0.005, 0.015, -0.008],0)
  42.376 ms (56 allocations: 4.07 MiB)
0.009152331502935747
using FastGaussQuadrature,
    Plots, LinearAlgebra, BenchmarkTools, TimerOutputs, StaticArrays

function sgf_3d_fs(x, y, z, x0, y0, z0)
    dx = x - x0
    dy = y - y0
    dz = z - z0
    dxx = dx^2
    dxy = dx * dy
    dxz = dx * dz
    dyy = dy^2
    dyz = dy * dz
    dzz = dz^2

    r = sqrt(dxx + dyy + dzz)
    r3 = r * r * r
    ri = 1.0 / r
    ri3 = 1.0 / r3

    g11 = ri + dxx * ri3
    g12 = dxy * ri3
    g13 = dxz * ri3

    g21 = g12
    g22 = ri + dyy * ri3
    g23 = dyz * ri3

    g31 = g13
    g32 = g23
    g33 = ri + dzz * ri3

    GEk = @SMatrix [
        g11 g12 g13
        g21 g22 g23
        g31 g32 g33
    ]
    return GEk
end

function inner_loop(xc, yc, tcx, tcy, sc, x1, y1, x2, y2, s1, s2, xg, wg, ng)
    # x1,y1,x2,y2 are coordinates of element j with nodes at j and j+1
    # xc, yc are the coordinates of collocation point
    GE = zeros(SMatrix{3,3,Float64})
    GEself = zeros(SMatrix{3,3,Float64})
    d = 0.001 # regularisation parameter
    @inbounds @simd for k = 1:ng
        xi = xg[k]
        xint = 0.5 * ((x2 + x1) + (x2 - x1) * xi)
        yint = 0.5 * ((y2 + y1) + (y2 - y1) * xi)
        sint = 0.5 * ((s2 + s1) + (s2 - s1) * xi)
        hl = 0.5 * sqrt((x2 - x1)^2 + (y2 - y1)^2)
        # Call Green's function for Stokes equation
        GEk = sgf_3d_fs(xint, yint, 0, xc, yc, 0) # two dimensional problem: z=0
        # REDUCE ME
        GE += GEk * hl * wg[k]
        ds = abs(sc - sint)
        dss = sqrt(ds^2 + d^2)
        Gs11 = ((1.0 + tcx * tcx) / dss) * hl * wg[k]
        Gs12 = ((0.0 + tcx * tcy) / dss) * hl * wg[k]
        Gs21 = ((0.0 + tcy * tcx) / dss) * hl * wg[k]
        Gs22 = ((1.0 + tcy * tcy) / dss) * hl * wg[k]
        # REDUCE ME
        GEself += @SMatrix [
            Gs11 Gs12 0
            Gs21 Gs22 0
            0 0 0
        ]
    end
    return GE, GEself
end

function outer_middle_loops(
    xm,
    ym,
    tmx,
    tmy,
    sm,
    vm,
    ls,
    s,
    c,
    x,
    y,
    xg,
    wg,
    ng,
    nx,
)
    # Initialize the matrix and right hand side
    RM = zeros(2 * (nx - 1) + 1, 2 * (nx - 1) + 1)
    rhs = zeros(2 * (nx - 1) + 1, 1)

    for j = 1:nx-1 # looping over columns first
        j1 = (j - 1) * 2 + 1
        j2 = (j - 1) * 2 + 2

        tcx = tmx[j]
        tcy = tmy[j]

        # Local operator terms( \Lambda [f_h] )
        # REDUCE ME
        RM[j1, j1] += -(1 * (2 - c) - tcx * tcx * (c + 2)) / 8π
        RM[j1, j2] += -(0 * (2 - c) - tcx * tcy * (c + 2)) / 8π
        RM[j2, j1] += -(0 * (2 - c) - tcy * tcx * (c + 2)) / 8π
        RM[j2, j2] += -(1 * (2 - c) - tcy * tcy * (c + 2)) / 8π

        rhs[j1] = 0
        rhs[j2] = vm[j]
        RM[2*(nx-1)+1, j1] = ls[j]
        RM[j1, 2*(nx-1)+1] = 1

        for i = 1:nx-1
            i1 = (i - 1) * 2 + 1
            i2 = (i - 1) * 2 + 2
            xc = xm[i]
            yc = ym[i]
            tcx = tmx[i]
            tcy = tmy[i]
            sc = sm[i]
            # Single element k-Loop Integration using quadratures
            GE, GEself = inner_loop(
                xc,
                yc,
                tcx,
                tcy,
                sc,
                x[j],
                y[j],
                x[j+1],
                y[j+1],
                s[j],
                s[j+1],
                xg,
                wg,
                ng,
            )

            # Non-local operator terms( K [f_h] )
            # REDUCE ME
            RM[i1, j1] += GE[1, 1] / 8π
            RM[i1, j2] += GE[1, 2] / 8π
            RM[i2, j1] += GE[2, 1] / 8π
            RM[i2, j2] += GE[2, 2] / 8π

            RM[i1, i1] += -GEself[1, 1] / 8π
            RM[i1, i2] += -GEself[1, 2] / 8π
            RM[i2, i1] += -GEself[2, 1] / 8π
            RM[i2, i2] += -GEself[2, 2] / 8π
        end
    end
    return RM, rhs
end

function filamentefficiency(nx, ng, A, t)
    # Empty grad vector
    # Specify the number of nodes
    # nx = 101
    # Note that the number of elements = nx-1
    # How many points needed for integration using Gauss Quadrature?
    # ng = 20
    # Specify the odd-numbered mo20
    # A = [1.0045792982170993, -0.009331896288504353, 0.011941839033552903, 0.004696343389728706, 0.015159353628757498, -0.0017903083000960515, -0.002894232919948337, -0.020512982818940012, 0.002637764690195105, -0.004037513173757581, -0.0018003802187872722, 3.678005969005036e-5, -0.0014290163838325764, 0.006142381043574873, -0.004210790695122118, 0.011885762061040916, -0.010245776149711871, -0.0023350247253041455, 0.9906425306905926, 0.0064477708652553225]
    # time
    # t = 0
    # Points and weights "using FastGaussQuadrature"
    xg, wg = gausslegendre(ng)
    # Generate a sin curve
    # x and y are nodes while x_m and y_m are mid points of an element
    x = LinRange(-π, π, nx)
    nm = 1:1:length(A)
    y = (sin.((x .- t) .* (2nm' .- 1))) * A #(Only odd numbered modes)
    # Mid-points (collocation points) and tangents at the mid points.
    xm = 0.5 * (x[2:nx] .+ x[1:nx-1])
    ym = 0.5 * (y[2:nx] .+ y[1:nx-1])
    ls = sqrt.((x[2:nx] .- x[1:nx-1]) .^ 2 .+ (y[2:nx] .- y[1:nx-1]) .^ 2) # length of each element
    tmx = (x[2:nx] .- x[1:nx-1]) ./ ls
    tmy = (y[2:nx] .- y[1:nx-1]) ./ ls
    sqrt.(tmx .^ 2 + tmy .^ 2)
    # Arc length of node points
    s = zeros(nx, 1)
    for i = 1:nx-1
        s[i+1] = s[i] + ls[i]
    end
    # Arc length of mid points
    sm = 0.5 * (s[2:nx] .+ s[1:nx-1])
    # Filament velocity at mid points (forcing for the equations)
    vm = (-(2nm' .- 1) .* cos.((xm .- t) .* (2nm' .- 1))) * A
    ρ = 0.001
    c = 2 * log(ρ / s[nx]) + 1
    RM, rhs = outer_middle_loops(
        xm,
        ym,
        tmx,
        tmy,
        sm,
        vm,
        ls,
        s,
        c,
        x,
        y,
        xg,
        wg,
        ng,
        nx,
    )

    fh = RM \ rhs
    Effc = -(fh[2*(nx-1)+1])^2 / (sum(fh[2:2:2*(nx-1)] .* ls .* vm))
    return Effc
    # println("The x-velocity is ", fh[2*(nx-1)+1])
    # display(plot(x,y, aspect_ratio=:equal))
    #display(quiver!(xm,ym,quiver=(tmx,tmy), aspect_ratio=:equal))
end