Speeding up FVM code

parallel

#1

Dear All,

I have been trying to speed up the following piece of code for a FVM method. I used Threads in the first for-loop but it is slower. Ideally, I would like it to be parallel. So I am about to re-write it using sparse matrix-vector products but it is quite a rewrite…

Before I do this, I would like to be sure there is not something better to do,

Thank you for your suggestions,

Best regards

Romain


function rhs!(divergence,Nv,Nw,invDv,invDw,g,V,W)
	# this is type stable!!
	@inbounds for i=1:Nv
		for j= Nw:-1:1
			# Values
			gij  = g[j,i]
			gimj = i > 1         ? g[j,i-1]:zero(gij)
			gipj = (i+1) <= Nv   ? g[j,i+1]:zero(gij)

			gijp = j+1 < Nw      ? g[j+1,i]:zero(gij)
			gijm = j-1 >= 1      ? g[j-1,i]:zero(gij)
			# Upwind fluxes + null flux boundary conditions
			Fij  = i > 1         ? (V[j,i]>=0.  ? V[j,i]*gimj: V[j,i]  *gij): zero(gij)
			Fipj = i < Nv        ? (V[j,i+1]>=0.? V[j,i+1]*gij:V[j,i+1]*gipj):zero(gij)

			Gijp = j < Nw        ? (W[j+1,i]>0. ? W[j+1,i]*gij:W[j+1,i]*gijp):zero(gij)
			Gij  = j>1           ? (W[j,i]>0.   ? W[j,i]*gijm: W[j,i]  *gij): zero(gij)
			# Divergence
			divergence[j,i] = invDv*(Fipj - Fij) + invDw*(Gijp - Gij)
		end
	end
end

N = 2000
divergence = rand(N,N)
g = rand(N,N)
V = rand(N,N)
W = rand(N,N)
@time rhs!(divergence,N,N,1.,1.,g,V,W)


#2

SIMD can give amazing speedups especially on Float32. You can try to bribe julia into using it by eliminating branches. e.g. use ifelse instead of ? and annotate the inner loop with @simd. I that does not work (probably it doesn’t, your code is complicated), you can do it manually using SIMD.jl.


#3

Agreed, this could be optimized further before going to threads (which might be trivial to implement with shared arrays?). The branches due to the BC are the simplest to eliminate: use an array of size N+2 x N+2, with zeroes on the boundary (or have a special loop for inside the domain and the boundary, which is more annoying). The branches resulting from the upwind fluxes are more tricky, it would be interesting to see if this can still be SIMDed (as @jw3126 says, ifelse might be useful here)


#4

Thank you for your answer. The following is 3 times slower actually

function rhs!(divergence,Nv,Nw,invDv,invDw,g,V,W)
	# this is type stable!!
	@inbounds for i=1:Nv 
        @simd for j= Nw:-1:1
			gij,gimj,gipj,gijp,gijm,Fij,Fipj,Gijp,Gij = 0.,0.,0.,0.,0.,0.,0.,0.,0.
			# Values
			gij  = g[j,i]
			# gimj = (i-1) >=1     ? g[j,i-1]:zero(gij)
			if i-1 >= 1
				gimj = g[j,i-1]
			end
			# gipj = (i+1) <= Nv   ? g[j,i+1]:zero(gij)
			if i+1<=Nv
				gipj = g[j,i+1]
			end
			# gijp = j+1 < Nw      ? g[j+1,i]:zero(gij)
			if j+1 < Nw
				gijp = g[j+1,i]
			end
			# gijm = j-1 >= 1        ? g[j-1,i]:zero(gij)
            if j-1>=1
                gijm = g[j-1,i]
            end
			# Upwind fluxes + null flux boundary conditions
			Fij  = i > 1         ? (V[j,i]>=0.  ? V[j,i]*gimj:V[j,i]*gij):zero(gij)
            if i>1
                if V[i,j]>=0.
                    Fij = V[j,i]*gimj
                else
                    Fij = V[j,i]*gij
                end
            end
			# Fipj = i < Nv        ? (V[j,i+1]>=0.? V[j,i+1]*gij:V[j,i+1]*gipj):zero(gij)
            if i < Nv
                if V[j,i+1]>=0.
                    Fipj = V[j,i+1]*gij
                else
                    Fipj = V[j,i+1]*gipj
                end
            end

			# Gijp = j < Nw        ? (W[j+1,i]>0. ? W[j+1,i]*gij:W[j+1,i]*gijp):zero(gij)
            if j < Nw
                if W[j+1,i]>0.
                    Gijp = W[j+1,i]*gij
                else
                    Gijp = W[j+1,i]*gijp
                end
            end
			# Gij  = j>1           ? (W[j,i]>0.   ? W[j,i]*gijm:W[j,i]*gij):zero(gij)
            if j > 1
                if W[j,i]>0.
                    Gij = W[j,i]*gijm
                else
                    Gijp = W[j,i]*gij
                end
            end
			# Divergence
			divergence[j,i] = invDv*(Fipj - Fij) + invDw*(Gijp - Gij)
		end
    end
end

#5

@threads easily makes code type unstable, so you should always first try to use a function barrier:


function inner(i, divergence,Nv,Nw,invDv,invDw,g,V,W)
    @inbounds for j= Nw:-1:1
        # Values
        gij  = g[j,i]
        gimj = i > 1         ? g[j,i-1]:zero(gij)
        gipj = (i+1) <= Nv   ? g[j,i+1]:zero(gij)

        gijp = j+1 < Nw      ? g[j+1,i]:zero(gij)
        gijm = j-1 >= 1      ? g[j-1,i]:zero(gij)
        # Upwind fluxes + null flux boundary conditions
        Fij  = i > 1         ? (V[j,i]>=0.  ? V[j,i]*gimj: V[j,i]  *gij): zero(gij)
        Fipj = i < Nv        ? (V[j,i+1]>=0.? V[j,i+1]*gij:V[j,i+1]*gipj):zero(gij)

        Gijp = j < Nw        ? (W[j+1,i]>0. ? W[j+1,i]*gij:W[j+1,i]*gijp):zero(gij)
        Gij  = j>1           ? (W[j,i]>0.   ? W[j,i]*gijm: W[j,i]  *gij): zero(gij)
        # Divergence
        divergence[j,i] = invDv*(Fipj - Fij) + invDw*(Gijp - Gij)
    end
end
function rhs!(divergence,Nv,Nw,invDv,invDw,g,V,W)
	# this is type stable!!
	Threads.@threads for i=1:Nv
        inner(i, divergence,Nv,Nw,invDv,invDw,g,V,W)
	end
end

using BenchmarkTools
function init()
    srand(42)
    N = 2000
    divergence = rand(N,N)
    g = rand(N,N)
    V = rand(N,N)
    W = rand(N,N)
    divergence, g, V, W
end
N = 2000
@btime rhs!(divergence,N,N,1.,1.,g,V,W) setup= ((divergence, g, V, W) = init())

This gives me a 3.3 speed up!


#6

ifelse is different

help?> ifelse
search: ifelse

  ifelse(condition::Bool, x, y)

  Return x if condition is true, otherwise return y. This differs from ? or if in that it is an ordinary function, so all the arguments are evaluated first. In some cases, using ifelse instead of an if
  statement can eliminate the branch in generated code and provide higher performance in tight loops.

  julia> ifelse(1 > 2, 1, 2)
  2

#7

For removing branches, I’d just go over the boundary in a seperate loop… That gives me a 2.5 speed up - but actually adding the threading then doesn’t improve performance much and i still end up with only 3.6x speed up overall - that doesn’t account for the extra loop which I was to lazy to code up yet :wink:


function inner(i, divergence,Nv,Nw,invDv,invDw,g,V,W)
    @simd for j = Nw-1:-1:2
        @inbounds begin
            # Values
            gij  = g[j, i]
            gimj = g[j, i-1]
            gipj = g[j, i+1]

            gijp = g[j+1, i]
            gijm = g[j-1, i]

            # Upwind fluxes + null flux boundary conditions
            vji = V[j,i]
            mul1 = ifelse(vji >= 0.0, gimj, gij)
            Fij  = vji * mul1
            
            vji = V[j,i+1]
            mul1 = ifelse(vji >= 0.0, gij, gipj)
            Fipj  = vji * mul1
            
            wji = W[j+1,i]
            mul = ifelse(wji > 0.0, gij, gijp)
            Gijp  = wji * mul
            
            wji = W[j,i]
            mul = ifelse(wji > 0.0, gijm, gij)
            Gij  = wji * mul

            # Divergence
            divergence[j,i] = invDv*(Fipj - Fij) + invDw*(Gijp - Gij)
        end
    end
end
function rhs!(divergence,Nv,Nw,invDv,invDw,g,V,W)
	Threads.@threads for i=2:Nv-1
        inner(i, divergence,Nv,Nw,invDv,invDw,g,V,W)
	end
end
using BenchmarkTools
function init()
    srand(42)
    N = 2000
    divergence = rand(N,N)
    g = rand(N,N)
    V = rand(N,N)
    W = rand(N,N)
    divergence, g, V, W
end
N = 2000
@btime rhs!(divergence,N,N,1.,1.,g,V,W) setup= ((divergence, g, V, W) = init())
20/5.5

Maybe this version will scale better for even bigger inputs!


#8

Thank you a lot for your quick answers!! I learned two things, function barriers and ifelse