I have written a finite difference 4th-order stencil generator macro and use the @tturbo
to accelerate my loop. My code gives perfect results for a single thread run, but triggers either a segfault or a deadlock in multithreaded mode. Given the lack of errors in single core mode with tturbo
removed, I conclude that there are no out of bounds. Therefore, I find it really hard to find the cause of this. I am also new in Julia, so I might have made some beginner’s mistake. My code is below. This is the whole script (300 lines), as I could not reduce this further to a smaller example showing the same behavior. The stencil is valid Julia code, so in principle the generated stencil could be pasted in and the macro removed. This also gives the same segfault.
## Packages
using BenchmarkTools
using LoopVectorization
## Macros
function make_index(a, arrays, i, j, k)
if a in arrays
i += (a in [ Symbol("u"), Symbol("ut")]) ? 0.5 : 0
j += (a in [ Symbol("v"), Symbol("vt")]) ? 0.5 : 0
k += (a in [ Symbol("w"), Symbol("wt")]) ? 0.5 : 0
if i < 0
i_int = convert(Int, abs(i))
ex_i = :( i-$i_int )
elseif i > 0
i_int = convert(Int, i)
ex_i = :( i+$i_int )
else
ex_i = :( i )
end
if j < 0
j_int = convert(Int, abs(j))
ex_j = :( j-$j_int )
elseif j > 0
j_int = convert(Int, j)
ex_j = :( j+$j_int )
else
ex_j = :( j )
end
if k < 0
k_int = convert(Int, abs(k))
ex_k = :( k-$k_int )
elseif k > 0
k_int = convert(Int, k)
ex_k = :( k+$k_int )
else
ex_k = :( k )
end
return :( $a[ $ex_i, $ex_j, $ex_k] )
else
return :( $a )
end
end
function process_expr(ex, arrays, i, j, k)
n = 1
if (isa(ex.args[1], Symbol) && ex.args[1] == Symbol("gradx"))
ex.args[1] = Symbol("gradx_")
ex = :( $ex * dxi )
elseif (isa(ex.args[1], Symbol) && ex.args[1] == Symbol("grady"))
ex.args[1] = Symbol("grady_")
ex = :( $ex * dyi )
elseif (isa(ex.args[1], Symbol) && ex.args[1] == Symbol("gradz"))
ex.args[1] = Symbol("gradz_")
ex = :( $ex * dzi )
end
args = ex.args
while n <= length(args)
if isa(args[n], Expr)
args[n] = process_expr(args[n], arrays, i, j, k)
n += 1
elseif isa(args[n], Symbol)
if args[n] == Symbol("gradx_")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i-1.5, j, k)
args[n+1] = process_expr(args[n+1], arrays, i-0.5, j, k)
args[n+2] = process_expr(args[n+2], arrays, i+0.5, j, k)
args[n+3] = process_expr(args[n+3], arrays, i+1.5, j, k)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i-1.5, j, k)
args[n+1] = make_index(args[n+1], arrays, i-0.5, j, k)
args[n+2] = make_index(args[n+2], arrays, i+0.5, j, k)
args[n+3] = make_index(args[n+3], arrays, i+1.5, j, k)
end
args[n ] = :( ( 1/24) * $(args[n ]) )
args[n+1] = :( (-27/24) * $(args[n+1]) )
args[n+2] = :( ( 27/24) * $(args[n+2]) )
args[n+3] = :( ( -1/24) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
elseif args[n] == Symbol("grady_")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i, j-1.5, k)
args[n+1] = process_expr(args[n+1], arrays, i, j-0.5, k)
args[n+2] = process_expr(args[n+2], arrays, i, j+0.5, k)
args[n+3] = process_expr(args[n+3], arrays, i, j+1.5, k)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i, j-1.5, k)
args[n+1] = make_index(args[n+1], arrays, i, j-0.5, k)
args[n+2] = make_index(args[n+2], arrays, i, j+0.5, k)
args[n+3] = make_index(args[n+3], arrays, i, j+1.5, k)
end
args[n ] = :( ( 1/24) * $(args[n ]) )
args[n+1] = :( (-27/24) * $(args[n+1]) )
args[n+2] = :( ( 27/24) * $(args[n+2]) )
args[n+3] = :( ( -1/24) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
elseif args[n] == Symbol("gradz_")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i, j, k-1.5)
args[n+1] = process_expr(args[n+1], arrays, i, j, k-0.5)
args[n+2] = process_expr(args[n+2], arrays, i, j, k+0.5)
args[n+3] = process_expr(args[n+3], arrays, i, j, k+1.5)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i, j, k-1.5)
args[n+1] = make_index(args[n+1], arrays, i, j, k-0.5)
args[n+2] = make_index(args[n+2], arrays, i, j, k+0.5)
args[n+3] = make_index(args[n+3], arrays, i, j, k+1.5)
end
args[n ] = :( ( 1/24) * $(args[n ]) )
args[n+1] = :( (-27/24) * $(args[n+1]) )
args[n+2] = :( ( 27/24) * $(args[n+2]) )
args[n+3] = :( ( -1/24) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
elseif args[n] == Symbol("interpx")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i-1.5, j, k)
args[n+1] = process_expr(args[n+1], arrays, i-0.5, j, k)
args[n+2] = process_expr(args[n+2], arrays, i+0.5, j, k)
args[n+3] = process_expr(args[n+3], arrays, i+1.5, j, k)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i-1.5, j, k)
args[n+1] = make_index(args[n+1], arrays, i-0.5, j, k)
args[n+2] = make_index(args[n+2], arrays, i+0.5, j, k)
args[n+3] = make_index(args[n+3], arrays, i+1.5, j, k)
end
args[n ] = :( ( -1/16) * $(args[n ]) )
args[n+1] = :( ( 9/16) * $(args[n+1]) )
args[n+2] = :( ( 9/16) * $(args[n+2]) )
args[n+3] = :( ( -1/16) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
elseif args[n] == Symbol("interpy")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i, j-1.5, k)
args[n+1] = process_expr(args[n+1], arrays, i, j-0.5, k)
args[n+2] = process_expr(args[n+2], arrays, i, j+0.5, k)
args[n+3] = process_expr(args[n+3], arrays, i, j+1.5, k)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i, j-1.5, k)
args[n+1] = make_index(args[n+1], arrays, i, j-0.5, k)
args[n+2] = make_index(args[n+2], arrays, i, j+0.5, k)
args[n+3] = make_index(args[n+3], arrays, i, j+1.5, k)
end
args[n ] = :( ( -1/16) * $(args[n ]) )
args[n+1] = :( ( 9/16) * $(args[n+1]) )
args[n+2] = :( ( 9/16) * $(args[n+2]) )
args[n+3] = :( ( -1/16) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
elseif args[n] == Symbol("interpz")
if isa(args[n+1], Expr)
args[n] = copy(args[n+1])
push!(args, copy(args[n+1]))
push!(args, copy(args[n+1]))
args[n ] = process_expr(args[n ], arrays, i, j, k-1.5)
args[n+1] = process_expr(args[n+1], arrays, i, j, k-0.5)
args[n+2] = process_expr(args[n+2], arrays, i, j, k+0.5)
args[n+3] = process_expr(args[n+3], arrays, i, j, k+1.5)
elseif isa(args[n+1], Symbol)
args[n] = args[n+1]
push!(args, args[n+1])
push!(args, args[n+1])
args[n ] = make_index(args[n ], arrays, i, j, k-1.5)
args[n+1] = make_index(args[n+1], arrays, i, j, k-0.5)
args[n+2] = make_index(args[n+2], arrays, i, j, k+0.5)
args[n+3] = make_index(args[n+3], arrays, i, j, k+1.5)
end
args[n ] = :( ( -1/16) * $(args[n ]) )
args[n+1] = :( ( 9/16) * $(args[n+1]) )
args[n+2] = :( ( 9/16) * $(args[n+2]) )
args[n+3] = :( ( -1/16) * $(args[n+3]) )
insert!(args, n, Symbol("+"))
n += 5
else
args[n] = make_index(args[n], arrays, i, j, k)
n += 1
end
else
n += 1
end
end
return ex
end
macro fd(arrays, ex)
i = (ex.args[1] in [ Symbol("u"), Symbol("ut")]) ? -0.5 : 0
j = (ex.args[1] in [ Symbol("v"), Symbol("vt")]) ? -0.5 : 0
k = (ex.args[1] in [ Symbol("w"), Symbol("wt")]) ? -0.5 : 0
ex = process_expr(ex, arrays.args, i, j, k)
println("Generated stencil: ")
println(ex)
return esc(ex)
end
## Advection, diffusion, time kernel.
function kernel!(
ut, u, v, w,
visc, dxi, dyi, dzi, dt,
is, ie, js, je, ks, ke)
@tturbo for k in ks:ke
for j in js:je
for i in is:ie
@fd (ut, u, v, w) ut += (
- gradx(interpx(u) * interpx(u)) + visc * (gradx(gradx(u)))
- grady(interpx(v) * interpy(u)) + visc * (grady(grady(u))) )
@fd (ut, u, v, w) ut += (
- gradz(interpx(w) * interpz(u)) + visc * (gradz(gradz(u))) )
end
end
end
@tturbo for k in ks:ke
for j in js:je
for i in is:ie
@fd (ut, u) u += dt*ut
@fd (ut, u) ut = 0.f0
end
end
end
end
## Initialize the grid.
itot = 384; jtot = 384; ktot = 384
igc = 4; jgc = 4; kgc = 4
dx = 1/itot; dy = 1/jtot; dz = 1/ktot
dxi = 1/dx; dyi = 1/dy; dzi = 1/dz
x = dx*collect(0:itot-1)
y = dy*collect(0:jtot-1)
z = dz*collect(0:ktot-1)
## Solve the problem in double precision.
visc = 1.5
dt = 1.e-3
u = zeros(Float64, (itot+2*igc, jtot+2*kgc, ktot+2*kgc))
v = zeros(Float64, (itot+2*igc, jtot+2*kgc, ktot+2*kgc))
w = zeros(Float64, (itot+2*igc, jtot+2*kgc, ktot+2*kgc))
ut = zeros(Float64, (itot+2*igc, jtot+2*kgc, ktot+2*kgc))
## Initialize with a sinus.
n_waves = 3
is = igc+1; ie = igc+ktot; js = jgc+1; je = jgc+jtot; ks = kgc+1; ke = kgc+ktot
ijk = [is:ie, js:je, ks:ke]
u[ijk...] = [ sin(n_waves*2*pi*x) + cos(n_waves*2*pi*y) + sin(n_waves*2*pi*z) for x=x, y=y, z=z]
## Run kernel.
is = igc+1; ie = igc+itot; js = jgc+1; je = jgc+jtot; ks = kgc+1; ke = kgc+ktot
@btime kernel!(
$ut, $u, $v, $w,
$visc, $dxi, $dyi, $dzi, $dt,
$is, $ie, $js, $je, $ks, $ke)