I have a code that generates with a macro a finite difference stencil. I have disabled the macro in order to make a minimal working example, so please forgive me for this absurdly long line, it is not there in my normal code. I have three questions concerning the performance of two functions, one with LoopVectorization
and one without.
- The performance difference is a factor 3. Is there a way to get the
_ref
function faster without having to useLoopVectorization
, because the compilation time is very long (~250 s withLoopVectorization
, ~ 1s without). - Is there a way to use precompilation here to avoid this 250 s?
- Why is the performance gain of the
_ref
function so minimal if I move toFloat32
while the function with@tturbo
gains nearly a factor 2?
## Packages and settings.
using BenchmarkTools
using LoopVectorization
const float_type = Float64
const half = convert(float_type, 1/2)
## CPU dynamics kernel.
function dynamics_w_kernel!(
wt, u, v, w, s,
visc, alpha,
dxi, dyi, dzi, dzhi,
is, ie, js, je, ks, ke)
@tturbo for k in ks:ke
for j in js:je
for i in is:ie
# @fd (wt, u, v, w, s) wt += (
# - gradx(interpz(u) * interpx(w)) + visc * (gradx(gradx(w)))
# - grady(interpz(v) * interpy(w)) + visc * (grady(grady(w)))
# - gradz(interpz(w) * interpz(w)) + visc * (gradz(gradz(w)))
# + alpha*interpz(s) )
wt[i, j, k] += ((((-((-1 * ((half * u[i, j, k - 1] + half * u[i, j, k]) * (half * w[i - 1, j, k] + half * w[i, j, k])) + 1 * ((half * u[i + 1, j, k - 1] + half * u[i + 1, j, k]) * (half * w[i, j, k] + half * w[i + 1, j, k]))) * dxi) + visc * ((-1 * ((-1 * w[i - 1, j, k] + 1 * w[i, j, k]) * dxi) + 1 * ((-1 * w[i, j, k] + 1 * w[i + 1, j, k]) * dxi)) * dxi)) - (-1 * ((half * v[i, j, k - 1] + half * v[i, j, k]) * (half * w[i, j - 1, k] + half * w[i, j, k])) + 1 * ((half * v[i, j + 1, k - 1] + half * v[i, j + 1, k]) * (half * w[i, j, k] + half * w[i, j + 1, k]))) * dyi) + visc * ((-1 * ((-1 * w[i, j - 1, k] + 1 * w[i, j, k]) * dyi) + 1 * ((-1 * w[i, j, k] + 1 * w[i, j + 1, k]) * dyi)) * dyi)) - (-1 * ((half * w[i, j, k - 1] + half * w[i, j, k]) * (half * w[i, j, k - 1] + half * w[i, j, k])) + 1 * ((half * w[i, j, k] + half * w[i, j, k + 1]) * (half * w[i, j, k] + half * w[i, j, k + 1]))) * dzhi[k]) + visc * ((-1 * ((-1 * w[i, j, k - 1] + 1 * w[i, j, k]) * dzi[k - 1]) + 1 * ((-1 * w[i, j, k] + 1 * w[i, j, k + 1]) * dzi[k])) * dzhi[k]) + alpha * (half * s[i, j, k - 1] + half * s[i, j, k])
end
end
end
end
function dynamics_w_kernel_ref!(
wt, u, v, w, s,
visc, alpha,
dxi, dyi, dzi, dzhi,
is, ie, js, je, ks, ke)
Threads.@threads for k in ks:ke
for j in js:je
@simd for i in is:ie
@inbounds wt[i, j, k] += ((((-((-1 * ((half * u[i, j, k - 1] + half * u[i, j, k]) * (half * w[i - 1, j, k] + half * w[i, j, k])) + 1 * ((half * u[i + 1, j, k - 1] + half * u[i + 1, j, k]) * (half * w[i, j, k] + half * w[i + 1, j, k]))) * dxi) + visc * ((-1 * ((-1 * w[i - 1, j, k] + 1 * w[i, j, k]) * dxi) + 1 * ((-1 * w[i, j, k] + 1 * w[i + 1, j, k]) * dxi)) * dxi)) - (-1 * ((half * v[i, j, k - 1] + half * v[i, j, k]) * (half * w[i, j - 1, k] + half * w[i, j, k])) + 1 * ((half * v[i, j + 1, k - 1] + half * v[i, j + 1, k]) * (half * w[i, j, k] + half * w[i, j + 1, k]))) * dyi) + visc * ((-1 * ((-1 * w[i, j - 1, k] + 1 * w[i, j, k]) * dyi) + 1 * ((-1 * w[i, j, k] + 1 * w[i, j + 1, k]) * dyi)) * dyi)) - (-1 * ((half * w[i, j, k - 1] + half * w[i, j, k]) * (half * w[i, j, k - 1] + half * w[i, j, k])) + 1 * ((half * w[i, j, k] + half * w[i, j, k + 1]) * (half * w[i, j, k] + half * w[i, j, k + 1]))) * dzhi[k]) + visc * ((-1 * ((-1 * w[i, j, k - 1] + 1 * w[i, j, k]) * dzi[k - 1]) + 1 * ((-1 * w[i, j, k] + 1 * w[i, j, k + 1]) * dzi[k])) * dzhi[k]) + alpha * (half * s[i, j, k - 1] + half * s[i, j, k])
end
end
end
end
## User input
itot = 384; jtot = 384; ktot = 384;
igc = 1; jgc = 1; kgc = 1;
icells = itot + 2igc; jcells = jtot + 2jgc; kcells = ktot + 2kgc
is = igc+1; ie = igc+itot; js = jgc+1; je = jgc+jtot; ks = kgc+1; ke = kgc+ktot
wt = rand(float_type, icells, jcells, kcells)
u = rand(float_type, icells, jcells, kcells)
v = rand(float_type, icells, jcells, kcells)
w = rand(float_type, icells, jcells, kcells)
s = rand(float_type, icells, jcells, kcells)
dxi = rand(float_type)
dyi = rand(float_type)
dzi = rand(float_type, kcells)
dzhi = rand(float_type, kcells)
visc = convert(float_type, 1)
alpha = convert(float_type, 9.81/300)
## Benchmark
for i in 1:10
@time dynamics_w_kernel!(
wt, u, v, w, s,
visc, alpha,
dxi, dyi, dzi, dzhi,
is, ie, js, je, ks, ke)
end
for i in 1:10
@time dynamics_w_kernel_ref!(
wt, u, v, w, s,
visc, alpha,
dxi, dyi, dzi, dzhi,
is, ie, js, je, ks, ke)
end