Parallelization in Julia vs C++

Do you tend to have problems parallelizing thousands of fast computations over a large matrix? I’m going to show a Julia and then a C++ version of the code snippet. I get a lot more performance benefits from parallelization with #pragma omp parallel for in C++ than I can get with Julia for some reason…

    P1a::Matrix{Float64} = zeros(4, mesh.nt)
    P1b::Matrix{Float64} = zeros(4, mesh.nt)
    P1c::Matrix{Float64} = zeros(4, mesh.nt)
    P1d::Matrix{Float64} = zeros(4, mesh.nt)

    # Singlethread
    @time P1!(mesh, P1a, P1b, P1c, P1d, 1:mesh.nt)
#Outputs: 0.781466 seconds (18.03 M allocations: 736.519 MiB, 9.84% gc time, 6.23% compilation time)

    # Multithreaded
    @time begin
        chunks = Iterators.partition(1:mesh.nt, cld(mesh.nt, Threads.nthreads()))
        @sync for chunk in chunks
            Threads.@spawn P1!(mesh, P1a, P1b, P1c, P1d, chunk)
        end
    end
#Outputs: 1.051446 seconds (18.00 M allocations: 735.137 MiB, 8.05% gc time, 15.01% compilation time)

The mesh has ‘nt’ elements (sometimes 1M), and each column of P1a,b,c,d is independent, so I batch the elements in chunks and send each chunk to a thread and @sync. It’s always somehow slower than the single-threaded version.
Here is the P1! function.

# Fill the columns k ∈ `chunk` of the four coefficient matrices with the
# linear-basis coefficients of element k. Each element writes only its own
# column, so non-overlapping chunks can be processed on separate threads
# (as the caller does with `Threads.@spawn`).
function P1!(mesh::MESH,
            P1a::Matrix{Float64},
            P1b::Matrix{Float64},
            P1c::Matrix{Float64},
            P1d::Matrix{Float64},
            chunk)

    @views for k in chunk
        elem = mesh.t[:, k]   # node labels of element k (view, no copy)
        for i in 1:4
            a, b, c, d = abcd(mesh.p, elem, elem[i])
            P1a[i, k] = a
            P1b[i, k] = b
            P1c[i, k] = c
            P1d[i, k] = d
        end
    end

    return nothing
end

The same code in C++, parallelized with #pragma omp parallel for, gives me 10x speed-ups.

#pragma omp parallel for // Each column of P1a..P1d is written by exactly one iteration, so there are no data races
for(int k = 0; k<mesh.nt; k++){ // For each element
	for(int i = 0; i<4; i++){ // For each node of current element
		// Coefficients (a, b, c, d) of the linear basis function attached to node i of element k
		Eigen::Vector4d abcd = linearBasis3D(mesh.p, mesh.t.col(k), mesh.t(i, k));
			
		P1a(i, k) = abcd(0);
		P1b(i, k) = abcd(1);
		P1c(i, k) = abcd(2);
		P1d(i, k) = abcd(3);
	}
}

Any help with this would be amazing.

As far as I can see, the function P1! itself should not allocate anything, yet your single-threaded timing reports 18M allocations — you should probably track that down first, before worrying about the parallel version.

What is the definition of your MESH type? What is abcd?

MESH is a mutable struct with types.

# Unstructured tetrahedral mesh container. Fields are filled in after
# construction (the constructor creates an uninitialized instance), hence
# `mutable struct`.
mutable struct MESH

    # Order | 1 - Linear, 2 - Quadratic
    order::Int

    # Node coordinates; columns are indexed by node label
    # (abcd reads p[1:3, node], so rows 1-3 are the x/y/z coordinates)
    p::Matrix{Float64}

    # Connectivity list; column k holds the node labels of element k
    # (P1! reads t[:, k] and iterates i in 1:4, i.e. 4 nodes per element)
    t::Matrix{Int32}
    
    # Surface triangles
    surfaceT::Matrix{Int32}

    # Edges (updated when doing second order meshes)
    edges::Vector{Int32} # Gmsh label of each edge midpoint (2nd order mesh node)
    
    # Map the global edge label to a local edge from 1 to 'ne'
    edge2localMap::Vector{Int32} 

    # Map mesh volume elements to the parent Cell/Volume ID
    elementID::Vector{Int32}

    # Elements inside material
    InsideElements::Vector{Int32}
    
    # Nodes inside material
    InsideNodes::Vector{Int32}
    
    # Volume of each mesh element
    VE::Vector{Float64}
    
    # Area of each surface mesh element
    AE::Vector{Float64} 

    # Normal of each surface triangle
    normal::Matrix{Float64}
    
    nv::Int32           # Number of nodes
    nt::Int32           # Number of elements
    ns::Int32           # Number of surface elements
    ne::Int32           # Number of edges

    nInside::Int32      # Number of elements for the inside cells
    nInsideNodes::Int32 # Number of nodes for the inside cells

    # Id of the surface boundaries of the bounding shell.
    # NOTE(review): this field has no type annotation, so it is concretely
    # `::Any` — accesses to it cannot be type-inferred. Consider annotating
    # it (TODO: confirm the intended element type with the author).
    shell_id

    # Constructor: returns an instance with all fields uninitialized;
    # callers are expected to assign every field before use.
    MESH() = new()
end

And abcd is

"""
    abcd(p, nodes, nd) -> (a, b, c, d)

Coefficients of the 3D linear (P1) basis function attached to node `nd` of the
tetrahedron whose four node labels are `nodes`: the affine function
`φ(x, y, z) = a + b*x + c*y + d*z` satisfies `φ = 1` at node `nd` and `φ = 0`
at the three remaining nodes. `p` holds the node coordinates, one column per
node (rows 1-3 are x, y, z).

Throws `ArgumentError` unless exactly three entries of `nodes` differ from `nd`.

Accepting `nd::Integer` is backward compatible with the previous
`nd::Int32` signature (mesh connectivity is stored as `Int32`).
"""
function abcd(p::Matrix{Float64}, nodes::AbstractVector, nd::Integer)
    # Pick out the three nodes other than `nd` without allocating.
    # The previous `nodes[nodes .!= nd]` allocated a Bool mask plus a result
    # vector, and the `@view p[1, [nd, n1, n2, n3]]` views each allocated an
    # index vector — five heap allocations per call, the hotspot behind the
    # ~18M allocations reported for a full mesh sweep.
    n1 = n2 = n3 = zero(eltype(nodes))
    found = 0
    for n in nodes
        n == nd && continue
        found += 1
        if found == 1
            n1 = n
        elseif found == 2
            n2 = n
        elseif found == 3
            n3 = n
        end
    end
    found == 3 || throw(ArgumentError("expected exactly 3 nodes different from nd, got $found"))

    # Rows ordered (nd, n1, n2, n3): solving M * r = e1 enforces φ(nd) = 1
    # and φ = 0 at the other three nodes, matching the original formulation.
    M = Float64[1.0 p[1, nd] p[2, nd] p[3, nd];
                1.0 p[1, n1] p[2, n1] p[3, n1];
                1.0 p[1, n2] p[2, n2] p[3, n2];
                1.0 p[1, n3] p[2, n3] p[3, n3]]
    r = M \ [1.0, 0.0, 0.0, 0.0]

    return r[1], r[2], r[3], r[4] # a b c d
end # Basis function coef.

It’s the abcd() function call that likely allocates memory. That shouldn’t be the cause, I think.

Probably it is. Try writing a non-allocating version of that — with allocations (and the GC pressure they cause) removed, I’m sure your serial and parallel versions will both be much, much faster.