Hello, I am back form vacation and the cluster is up since yesterday.

I have this GPU code (I cannot reduce it more)

```
# Package
# ≡≡≡≡≡≡≡≡≡≡≡≡≡
using CUDA
using Statistics
Ti = Int64
Tf = Float64
Ta = CuArray
WarpsT = 16
Bx = 8
Bz = 8
thrds2D = (WarpsT, WarpsT)
blcks2D = (Bx, Bz)
# Size
Nx = Ti(WarpsT * Bx)
Nz = Ti(WarpsT * Bz)
# Weigth
w = Ta([4/9 1/9 1/9 1/9 1/9 1/36 1/36 1/36 1/36])
# Initialization of population
# ––––––––––––––––––––––––––––––
function kernel_initialize_pop!(f, w)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
for k in eachindex(w)
f[i,j,k] = w[k]
end
return nothing
end
# Collide and stream (periodic)
# –––––––––––––––––––––––––––––––
function kernel_collide_stream!(fn, f, F, ρ_, v, ω, Nx, Nz)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
@inbounds begin
# For streaming
i_ = mod(i-1,1:Nx); ip = mod(i+1,1:Nx)
jm = mod(j-1,1:Nz); jp = mod(j+1,1:Nz)
# Density and velocity
ρ = ρ_[i,j]; vx = v[i,j,1]; vz = v[i,j,2]
ρvx = ρ*vx; ρvz=ρ*vz
# Source term factor
ω_ = (1-0.5*ω)*1/3
Fx=F[i,j,1]; Fz=F[i,j,2]
# Post collision-Streaming populations
ρvx2 = ρ*vx*vx; ρvz2 = ρ*vz*vz; ρvxvz = vx*vz*ρ; ω__ = 1-ω
fn[i,j,1] = ω__*f[i,j,1] + ω*4/9*(ρ - 1.5*(ρvx2+ρvz2) ) + 4*ω_*( -Fx*vx-Fz*vz) # ξ_0 = (0,0)
fn[ip, j, 2] = ω__*f[i,j,2] + ω*1/9*(ρ + 3*ρvx + 3*ρvx2 - 1.5*ρvz2 ) + ω_*(Fx +2*vx*Fx-Fz*vz) # ξ_1 = (1,0)
fn[i_, j, 4] = ω__*f[i,j,4] + ω*1/9*(ρ - 3*ρvx + 3*ρvx2 - 1.5*ρvz2 ) + ω_*(-Fx +2*vx*Fx-Fz*vz) # ξ_3 = (-1,0)
fn[i, jp, 3] = ω__*f[i,j,3] + ω*1/9*(ρ + 3*ρvz + 3*ρvz2 - 1.5*ρvx2 ) + ω_*(Fz +2*vz*Fz-Fx*vx) # ξ_2 = (0,1)
fn[i, jm, 5] = ω__*f[i,j,5] + ω*1/9*(ρ - 3*ρvz + 3*ρvz2 - 1.5*ρvx2 ) + ω_*(-Fz +2*vz*Fz-Fx*vx) # ξ_4 = (0,-1)
fn[ip, jp, 6] = ω__*f[i,j,6] + ω*1/36*(ρ + 3*(ρvx+ρvz) + 3*(ρvx2+ρvz2) + 9*ρvxvz ) + 0.25*ω_*(Fx+Fz +2*(vx*Fx+vz*Fz) +3*(Fx*vz+vx*Fz) ) # ξ_5 = (1,1)
fn[i_, jp, 7] = ω__*f[i,j,7] + ω*1/36*(ρ - 3*(ρvx-ρvz) + 3*(ρvx2+ρvz2) - 9*ρvxvz ) + 0.25*ω_*(-Fx+Fz +2*(vx*Fx+vz*Fz) -3*(Fx*vz+vx*Fz) ) # ξ_6 = (-1,1)
fn[i_, jm, 8] = ω__*f[i,j,8] + ω*1/36*(ρ - 3*(ρvx+ρvz) + 3*(ρvx2+ρvz2) + 9*ρvxvz ) + 0.25*ω_*(-Fx-Fz +2*(vx*Fx+vz*Fz) +3*(Fx*vz+vx*Fz) ) # ξ_7 = (-1,-1)
fn[ip, jm, 9] = ω__*f[i,j,9] + ω*1/36*(ρ + 3*(ρvx-ρvz) + 3*(ρvx2+ρvz2) - 9*ρvxvz ) + 0.25*ω_*(Fx-Fz +2*(vx*Fx+vz*Fz) -3*(Fx*vz+vx*Fz) ) # ξ_8 = (1,-1)
end
return nothing
end
# Compute moments (ρ, v)
# ––––––––––––––––––––––––
function kernel_comp_ρ_v!(ρ, v, f, F)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
@inbounds ρ[i, j] = f[i, j, 1] + f[i, j, 2] + f[i, j, 3] + f[i, j, 4] + f[i, j, 5] +
f[i, j, 6] + f[i, j, 7] + f[i, j, 8] + f[i, j, 9]
@inbounds v[i, j, 1] = (f[i, j, 2] - f[i, j, 4] + f[i, j, 6] - f[i, j, 7] -
f[i, j, 8] + f[i, j, 9] + 0.5*F[i,j,1])/ρ[i,j]
@inbounds v[i, j, 2] = (f[i, j, 3] - f[i, j, 5] + f[i, j, 6] + f[i, j, 7] -
f[i, j, 8] - f[i, j, 9] + 0.5*F[i,j,2])/ρ[i,j]
return nothing
end
# Stencil
# =========
@inline function stencil_4(dist)
if dist<1.
return Float64(0.125*(3-2*dist+CUDA.sqrt(1+4*dist-4*dist*dist)))
elseif dist<2.
return Float64(0.125*(5-2*dist-CUDA.sqrt(-7+12*dist-4*dist*dist)))
else
return Float64(0.0)
end
end
# Interpolate the fluid velocity to the object nodes
# ====================================================
function kernel_interpolate!(boundary, n_nodes, v, Nx, Nz, stencil)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > n_nodes
return nothing
end
boundary[i,5] = 0
boundary[i,6] = 0
x_int = trunc(Int, boundary[i,1])
y_int = trunc(Int, boundary[i,2])
xs = x_int-1; xe = x_int+2; ys = y_int-1; ye = y_int+2
for x=xs:xe
for y=ys:ye
xx = mod(x,1:Nx)
yy = mod(y,1:Nz)
wx = stencil(Float64(abs(x+0.5 - boundary[i,1])))
wy = stencil(Float64(abs(y+0.5 - boundary[i,2])))
boundary[i,5] += v[xx,yy,1]*wx*wy
boundary[i,6] += v[xx,yy,2]*wx*wy
end
end
return nothing
end
# update node positions
# =======================
function kernel_update_particle_position!(boundary, n_nodes, Nx, Nz)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > n_nodes
return nothing
end
boundary[i,1] = boundary[i,1] + boundary[i,5]
boundary[i,2] = boundary[i,2] + boundary[i,6]
return nothing
end
# compute node forces based on the object's deformation
# =======================================================
function kernel_comp_particle_forces!(boundary, n_nodes, Nx)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > n_nodes
return nothing
end
area = Nx / n_nodes
boundary[i,7] = -boundary[i,9] * (boundary[i,1] - boundary[i,3]) * area
boundary[i,8] = -boundary[i,9] * (boundary[i,2] - boundary[i,4]) * area
return nothing
end
# spread forces to the fluid lattice
# ====================================
function kernel_spread!(boundary, F, Nx, Nz, n_nodes, stencil)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > n_nodes
return nothing
end
x_int = trunc(Int, boundary[i,1])
y_int = trunc(Int, boundary[i,2])
xs = x_int-1; xe = x_int+2; ys = y_int-1; ye = y_int+2
for x=xs:xe
for y=ys:ye
xx = mod(x,1:Nx)
yy = mod(y,1:Nz)
wx = stencil(Float64(abs(boundary[i,1] - 0.5 - x)))
wy = stencil(Float64(abs(boundary[i,2] - 0.5 - y)))
F[xx,yy,1] += boundary[i,7]*wx*wy
F[xx,yy,2] += boundary[i,8]*wx*wy
end
end
return nothing
end
# Building object
# ≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡
function kernel_wall!(wall, n_nodes, begx, begz, endx, endz, stiff, Nx, Nz)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if i > n_nodes
return nothing
end
posx = begx + (i-1)*(endx-begx)/n_nodes
posx = mod(posx, Nx)
posz = begz + (i-1)*(endz-begz)/n_nodes
posz = mod(posz, Nz)
wall[i,1] = posx
wall[i,3] = posx
wall[i,2] = posz
wall[i,4] = posz
wall[i,9] = stiff
return nothing
end
# Poiseuille
# ≡≡≡≡≡≡≡≡≡≡≡≡
# Parameters
# ============
wall_num_nodes = Nx*10
wall_stiffness = 1.0 # IBM spring constant
B = trunc(Ti, wall_num_nodes/WarpsT)+1
NΔt = 10000
τ=sqrt(3/16)+0.5 # relaxation time (BGK model)
ω=1/τ
vx_max=0.001 # maximum velocity
ν=(2*τ-1)/6 # kinematic shear viscosity
∇P = Tf(8*ν*vx_max/(Nz^2)) # Pressure gradient
# Initialization
# ================
f = CUDA.zeros(Tf, Nx, Nz, 9)
f_t = CUDA.zeros(Tf, Nx, Nz, 9)
ρ = CUDA.ones(Tf, Nx, Nz)
v = CUDA.zeros(Tf, Nx, Nz, 2)
F = CUDA.zeros(Tf, Nx, Nz, 2)
boundary = CUDA.zeros(Tf, wall_num_nodes, 9)
boundary_cpu = zeros(Tf, wall_num_nodes, 9)
# Time loop
# ===========
@cuda threads = thrds2D blocks = blcks2D kernel_initialize_pop!(f, w)
@cuda threads = WarpsT blocks = B kernel_wall!(boundary, wall_num_nodes, 0.0, Nz*0.5, Nx-0.5, Nz*0.5, wall_stiffness, Nx, Nz)
for t=0:NΔt
@. F[:,:,1] = ∇P
@. F[:,:,2] = 0.0
@cuda threads = WarpsT blocks = B kernel_interpolate!(boundary, wall_num_nodes, v, Nx, Nz, stencil_4)
@cuda threads = WarpsT blocks = B kernel_update_particle_position!(boundary, wall_num_nodes, Nx, Nz)
@cuda threads = WarpsT blocks = B kernel_comp_particle_forces!(boundary, wall_num_nodes, Nx)
@cuda threads = WarpsT blocks = B kernel_spread!(boundary, F, Nx, Nz, wall_num_nodes, stencil_4)
@cuda threads = thrds2D blocks = blcks2D kernel_collide_stream!(f_t, f, F, ρ, v, ω, Nx, Nz)
@. f = f_t
@cuda threads = thrds2D blocks = blcks2D kernel_comp_ρ_v!(ρ, v, f, F)
end
boundary_cpu= boundary;
err = std(boundary_cpu[:,2])
println("Error: ", err)
```

Corresponding to this CPU code (I just changed the gpu idexing by `for`

loop(s)

```
# Package
# ≡≡≡≡≡≡≡≡≡≡≡≡≡
using Statistics
Ti = Int64
Tf = Float64
Ta = Array
# Size
Nx = Ti(128)
Nz = Ti(128)
# Weigth
w = Ta([4/9 1/9 1/9 1/9 1/9 1/36 1/36 1/36 1/36])
# Initialization of population
# ––––––––––––––––––––––––––––––
function cpu_initialize_pop!(f, w)
for i in 1:Nx, j in 1:Nz, k in eachindex(w)
f[i,j,k] = w[k]
end
return nothing
end
# Collide and stream (periodic)
# –––––––––––––––––––––––––––––––
function cpu_collide_stream!(fn, f, F, ρ_, v, ω, Nx, Nz)
for i in 1:Nx, j in 1:Nz
@inbounds begin
# For streaming
i_ = mod(i-1,1:Nx); ip = mod(i+1,1:Nx)
jm = mod(j-1,1:Nz); jp = mod(j+1,1:Nz)
# Density and velocity
ρ = ρ_[i,j]; vx = v[i,j,1]; vz = v[i,j,2]
ρvx = ρ*vx; ρvz=ρ*vz
# Source term factor
ω_ = (1-0.5*ω)*1/3
Fx=F[i,j,1]; Fz=F[i,j,2]
# Post collision-Streaming populations
ρvx2 = ρ*vx*vx; ρvz2 = ρ*vz*vz; ρvxvz = vx*vz*ρ; ω__ = 1-ω
fn[i,j,1] = ω__*f[i,j,1] + ω*4/9*(ρ - 1.5*(ρvx2+ρvz2) ) + 4*ω_*( -Fx*vx-Fz*vz) # ξ_0 = (0,0)
fn[ip, j, 2] = ω__*f[i,j,2] + ω*1/9*(ρ + 3*ρvx + 3*ρvx2 - 1.5*ρvz2 ) + ω_*(Fx +2*vx*Fx-Fz*vz) # ξ_1 = (1,0)
fn[i_, j, 4] = ω__*f[i,j,4] + ω*1/9*(ρ - 3*ρvx + 3*ρvx2 - 1.5*ρvz2 ) + ω_*(-Fx +2*vx*Fx-Fz*vz) # ξ_3 = (-1,0)
fn[i, jp, 3] = ω__*f[i,j,3] + ω*1/9*(ρ + 3*ρvz + 3*ρvz2 - 1.5*ρvx2 ) + ω_*(Fz +2*vz*Fz-Fx*vx) # ξ_2 = (0,1)
fn[i, jm, 5] = ω__*f[i,j,5] + ω*1/9*(ρ - 3*ρvz + 3*ρvz2 - 1.5*ρvx2 ) + ω_*(-Fz +2*vz*Fz-Fx*vx) # ξ_4 = (0,-1)
fn[ip, jp, 6] = ω__*f[i,j,6] + ω*1/36*(ρ + 3*(ρvx+ρvz) + 3*(ρvx2+ρvz2) + 9*ρvxvz ) + 0.25*ω_*(Fx+Fz +2*(vx*Fx+vz*Fz) +3*(Fx*vz+vx*Fz) ) # ξ_5 = (1,1)
fn[i_, jp, 7] = ω__*f[i,j,7] + ω*1/36*(ρ - 3*(ρvx-ρvz) + 3*(ρvx2+ρvz2) - 9*ρvxvz ) + 0.25*ω_*(-Fx+Fz +2*(vx*Fx+vz*Fz) -3*(Fx*vz+vx*Fz) ) # ξ_6 = (-1,1)
fn[i_, jm, 8] = ω__*f[i,j,8] + ω*1/36*(ρ - 3*(ρvx+ρvz) + 3*(ρvx2+ρvz2) + 9*ρvxvz ) + 0.25*ω_*(-Fx-Fz +2*(vx*Fx+vz*Fz) +3*(Fx*vz+vx*Fz) ) # ξ_7 = (-1,-1)
fn[ip, jm, 9] = ω__*f[i,j,9] + ω*1/36*(ρ + 3*(ρvx-ρvz) + 3*(ρvx2+ρvz2) - 9*ρvxvz ) + 0.25*ω_*(Fx-Fz +2*(vx*Fx+vz*Fz) -3*(Fx*vz+vx*Fz) ) # ξ_8 = (1,-1)
end
end
return nothing
end
# Compute moments (ρ, v)
# ––––––––––––––––––––––––
function cpu_comp_ρ_v!(ρ, v, f, F)
for i in 1:Nx, j in 1:Nz
@inbounds ρ[i, j] = f[i, j, 1] + f[i, j, 2] + f[i, j, 3] + f[i, j, 4] + f[i, j, 5] +
f[i, j, 6] + f[i, j, 7] + f[i, j, 8] + f[i, j, 9]
@inbounds v[i, j, 1] = (f[i, j, 2] - f[i, j, 4] + f[i, j, 6] - f[i, j, 7] -
f[i, j, 8] + f[i, j, 9] + 0.5*F[i,j,1])/ρ[i,j]
@inbounds v[i, j, 2] = (f[i, j, 3] - f[i, j, 5] + f[i, j, 6] + f[i, j, 7] -
f[i, j, 8] - f[i, j, 9] + 0.5*F[i,j,2])/ρ[i,j]
end
return nothing
end
# Stencil
# =========
@inline function stencil_4(dist)
if dist<1.
return 0.125*(3-2*dist+sqrt(1+4*dist-4*dist*dist))
elseif dist<2.
return 0.125*(5-2*dist-sqrt(-7+12*dist-4*dist*dist))
else
return 0.0
end
end
# Interpolate the fluid velocity to the object nodes
# ====================================================
function cpu_interpolate!(boundary, n_nodes, v, Nx, Nz, stencil)
for i = 1:n_nodes
boundary[i,5] = 0
boundary[i,6] = 0
x_int = trunc(Int, boundary[i,1])
y_int = trunc(Int, boundary[i,2])
xs = x_int-1; xe = x_int+2; ys = y_int-1; ye = y_int+2
for x=xs:xe
for y=ys:ye
xx = mod(x,1:Nx)
yy = mod(y,1:Nz)
wx = stencil(abs(x+0.5 - boundary[i,1]))
wy = stencil(abs(y+0.5 - boundary[i,2]))
boundary[i,5] += v[xx,yy,1]*wx*wy
boundary[i,6] += v[xx,yy,2]*wx*wy
end
end
end
return nothing
end
# update node positions
# =======================
function cpu_update_particle_position!(boundary, n_nodes, Nx, Nz)
for i = 1:n_nodes
boundary[i,1] = boundary[i,1] + boundary[i,5]
boundary[i,2] = boundary[i,2] + boundary[i,6]
end
return nothing
end
# compute node forces based on the object's deformation
# =======================================================
function cpu_comp_particle_forces!(boundary, n_nodes, Nx)
for i = 1:n_nodes
area = Nx / n_nodes
boundary[i,7] = -boundary[i,9] * (boundary[i,1] - boundary[i,3]) * area
boundary[i,8] = -boundary[i,9] * (boundary[i,2] - boundary[i,4]) * area
end
return nothing
end
# spread forces to the fluid lattice
# ====================================
function cpu_spread!(boundary, F, Nx, Nz, n_nodes, stencil)
for i = 1:n_nodes
x_int = trunc(Int, boundary[i,1])
y_int = trunc(Int, boundary[i,2])
xs = x_int-1; xe = x_int+2; ys = y_int-1; ye = y_int+2
for x=xs:xe
for y=ys:ye
xx = mod(x,1:Nx)
yy = mod(y,1:Nz)
wx = stencil(abs(boundary[i,1] - 0.5 - x))
wy = stencil(abs(boundary[i,2] - 0.5 - y))
F[xx,yy,1] += boundary[i,7]*wx*wy
F[xx,yy,2] += boundary[i,8]*wx*wy
end
end
end
return nothing
end
# Building object
# ≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡
function cpu_wall!(wall, n_nodes, begx, begz, endx, endz, stiff, Nx, Nz)
for i = 1:n_nodes
posx = begx + (i-1)*(endx-begx)/n_nodes
posx = mod(posx, Nx)
posz = begz + (i-1)*(endz-begz)/n_nodes
posz = mod(posz, Nz)
wall[i,1] = posx
wall[i,3] = posx
wall[i,2] = posz
wall[i,4] = posz
wall[i,9] = stiff
end
return nothing
end
# Poiseuille
# ≡≡≡≡≡≡≡≡≡≡≡≡
# Parameters
# ============
wall_num_nodes = Nx*10
wall_stiffness = 1.0 # IBM spring constant
NΔt = 10000
τ=sqrt(3/16)+0.5 # relaxation time (BGK model)
ω=1/τ
vx_max=0.001 # maximum velocity
ν=(2*τ-1)/6 # kinematic shear viscosity
∇P = Tf(8*ν*vx_max/(Nz^2)) # Pressure gradient
# Initialization
# ================
f = zeros(Tf, Nx, Nz, 9)
f_t = zeros(Tf, Nx, Nz, 9)
ρ = ones(Tf, Nx, Nz)
v = zeros(Tf, Nx, Nz, 2)
F = zeros(Tf, Nx, Nz, 2)
boundary = zeros(Tf, wall_num_nodes, 9)
# Time loop
# ===========
cpu_initialize_pop!(f, w)
cpu_wall!(boundary, wall_num_nodes, 0.0, Nz*0.5, Nx-0.5, Nz*0.5, wall_stiffness, Nx, Nz)
for t=0:NΔt
if t%100==0
print(t,", ")
end
@. F[:,:,1] = ∇P
@. F[:,:,2] = 0.0
cpu_interpolate!(boundary, wall_num_nodes, v, Nx, Nz, stencil_4)
cpu_update_particle_position!(boundary, wall_num_nodes, Nx, Nz)
cpu_comp_particle_forces!(boundary, wall_num_nodes, Nx)
cpu_spread!(boundary, F, Nx, Nz, wall_num_nodes, stencil_4)
cpu_collide_stream!(f_t, f, F, ρ, v, ω, Nx, Nz)
@. f = f_t
cpu_comp_ρ_v!(ρ, v, f, F)
end
err = std(boundary[:,2])
println("Error: ", err)
```

I obtain an error of 0.0 on cpu, of order 1e-6 on K620, and 1e-3 on A100…

I tried many things, my next step will be to write the same conde using CUDA C or pycuda to see if I have the same problem.

I have absolutely no idea of where the mistake come from, and my ignorance in gpu computing is probably the main issue.

Thank you for you help,

If you see something wrong it would be incredibly nice.