Adapt.adapt_structure(CUDA.KernelAdaptor(), ...)
also seems to be the culprit here. Changing from CUDA.KernelAdaptor()
to something else (e.g. CuArray) and using Adapt.@adapt_structure Ferrite.GPUGrid
etc. seems to fix the issue. I assume it boils down to e.g.
function Adapt.adapt_structure(to, grid::Grid)
    # map Int64 to Int32 to reduce the number of registers
    cu_cells = grid.cells .|> (x -> Int32.(x.nodes)) .|> Quadrilateral |> cu
    cells = Adapt.adapt_structure(to, cu_cells)
    nodes = Adapt.adapt_structure(to, cu(grid.nodes))
    GPUGrid(cells, nodes)
end
being broken with KernelAdaptor, because the CuArrays created inside the function are only referenced locally and get freed once it returns, so the CuDeviceArrays end up referring to dead memory? However, even if this is the case, I do not really understand why calling adapt multiple times (or running the program multiple times) also fixes the issue.
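If that is what happens, a minimal sketch of the suspected lifetime problem would look like this (toy code with made-up names, not Ferrite's actual types):

using CUDA, Adapt

# Fragile: `tmp` is the only host-side reference to the CuArray. Once the
# function returns, nothing keeps the buffer alive, so the GC (or CUDA.jl's
# memory pool) may free/reuse it while the returned CuDeviceArray still
# points at it.
function to_device_dangling()
    tmp = cu(rand(Float32, 16))
    return Adapt.adapt_structure(CUDA.KernelAdaptor(), tmp) # CuDeviceArray
end

# Safer: adapt to CuArray instead, keep the result alive on the host, and let
# @cuda do the KernelAdaptor conversion at launch time.
function to_device_safe()
    return cu(rand(Float32, 16)) # CuArray, still rooted on the host
end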
Working example on the PR:
function assemble_element_gpu!(#=assembler,cv,=#grid, n_cells_colored, eles_colored)
    tx = threadIdx().x
    bx = blockIdx().x
    bd = blockDim().x
    e_color = tx + (bx - Int32(1)) * bd # element index within the current color
    e_color ≤ n_cells_colored || return nothing
    # @cushow e_color
    # n_basefuncs = getnbasefunctions(cv)
    e = eles_colored[e_color] # e is the global index of the current element
    # @cushow e
    cell_coords = getcoordinates(grid, e)
    # ke = MMatrix{4,4,Float32}(undef) # Note: using n_basefuncs instead of 4 will throw an error because this kind of dynamism is not supported on the GPU.
    # fill!(ke, 0.0f0)
    # fe = MVector{4,Float32}(undef)
    # fill!(fe, 0.0f0)
    # Loop over quadrature points
    # for qv in Ferrite.QuadratureValuesIterator(cv, cell_coords)
    #     ## Get the quadrature weight
    #     dΩ = getdetJdV(qv)
    #     ## Loop over test shape functions
    #     for i in 1:n_basefuncs
    #         δu = shape_value(qv, i)
    #         ∇δu = shape_gradient(qv, i)
    #         ## Add contribution to fe
    #         fe[i] += δu * dΩ
    #         ## Loop over trial shape functions
    #         for j in 1:n_basefuncs
    #             ∇u = shape_gradient(qv, j)
    #             ## Add contribution to Ke
    #             ke[i, j] += (∇δu ⋅ ∇u) * dΩ
    #         end
    #     end
    # end
    ## Assemble Ke into Kgpu ##
    # assemble!(assembler, celldofs(dh, e), SMatrix(ke), SVector(fe)) # passing mutable objects throws an error
    return nothing
end
Adapt.@adapt_structure Ferrite.GPUGrid
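# For reference, the macro above should expand to roughly the following
# (my reading of Adapt.@adapt_structure, not the literal expansion; GPUGrid's
# fields are assumed to be `cells` and `nodes` as in the constructor above):
#   Adapt.adapt_structure(to, g::Ferrite.GPUGrid) =
#       Ferrite.GPUGrid(Adapt.adapt(to, g.cells), Adapt.adapt(to, g.nodes))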
function assemble_global_gpu_color(cellvalues, dh, colors)
    K = create_sparsity_pattern(dh, Float32)
    # Kgpu = CUSPARSE.CuSparseMatrixCSC(K)
    fgpu = CUDA.zeros(ndofs(dh))
    # assembler = start_assemble(Kgpu, fgpu)
    n_colors = length(colors)
    # set up kernel adaption
    # FIXME: The following three lines are necessary to circumvent getting rubbish values.
    # One call will make dofs[i] give 6 instead of 4 (ref: see `gpu_assembler.jl`).
    # A second call will fix the first issue but will give rubbish values for node_ids[i] (ref: see `gpu_grid.jl`).
    # A third call will fix everything.
    gpu_grid = Adapt.adapt_structure(CuArray, dh.grid)
    cellvalues_gpu = Adapt.adapt_structure(CUDA.KernelAdaptor(), cellvalues)
    for i in 1:n_colors
        color = colors[i]
        gpucolor = cu(color)
        @show gpucolor
        kernel = @cuda launch=false assemble_element_gpu!(gpu_grid, Int32(length(color)), gpucolor)
        # @show CUDA.registers(kernel)
        config = launch_configuration(kernel.fun)
        threads = 1 # min(length(colors[i]), config.threads)
        blocks = cld(length(color), threads)
        kernel(gpu_grid, Int32(length(color)), gpucolor; threads, blocks)
    end
    # return Kgpu, fgpu
end
To break it, switch to
...
gpu_grid = Adapt.adapt_structure(CUDA.KernelAdaptor(), dh.grid)
...
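Note that `@cuda` already runs `cudaconvert` on every kernel argument at launch, and as far as I can tell `cudaconvert` is just `adapt` with `CUDA.KernelAdaptor()`:

# what kernel(gpu_grid, ...) effectively does at launch (my understanding of
# CUDA.jl internals):
#   CUDA.cudaconvert(gpu_grid) # == Adapt.adapt(CUDA.KernelAdaptor(), gpu_grid)

So the working version performs the same device conversion, just lazily at launch while `gpu_grid` (and the CuArrays it wraps) are still referenced on the host; the breaking version performs it eagerly and throws the intermediate CuArrays away.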
I cannot produce any meaningful error messages beyond
Out-of-bounds array access
Stacktrace:
[1] throw_boundserror at /home/dogiermann/.julia/packages/CUDA/75aiI/src/device/quirks.jl:15
[2] multiple call sites at unknown:0
on CUDA 5.4.2.
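(Side note: running Julia at debug level 2, i.e. `julia -g2`, should at least turn the `multiple call sites at unknown:0` frame into a device stacktrace with line info; I have not tried whether it reveals more here.)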