I am trying to call sync_threads() within a GPU kernel, but I can't seem to get it to work inside a for loop. Here is an MWE:
using CuArrays, CUDAnative, CUDAdrv
CuArrays.allowscalar(false)
x = cu(rand(8))
function kernel_for_loop!(x)
    index = 2*((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1
    stride = 2*(blockDim().x * gridDim().x)
    for offset in [0,1]
        sync_threads()
    end
    return nothing
end
function run_kernel_for_loop!(x)
    numblocks = ceil(Int, length(x)/256)
    @cuda threads=256 blocks=numblocks kernel_for_loop!(x)
end
@time run_kernel_for_loop!(x)
which gives the error:
ERROR: InvalidIRError: compiling kernel_for_loop!(CuDeviceArray{Float32,1,CUDAnative.AS.Global}) resulted in invalid LLVM IR
Reason: unsupported call through a literal pointer (call to jl_alloc_array_1d)
Stacktrace:
[1] Type at boot.jl:402
[2] Type at boot.jl:411
[3] Type at boot.jl:419
[4] similar at abstractarray.jl:618
[5] similar at abstractarray.jl:617
[6] vect at array.jl:130
[7] kernel_for_loop! at C:\git\intro_to_julia\13_mwe.jl:12
Stacktrace:
[1] check_ir(::CUDAnative.CompilerContext, ::LLVM.Module) at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\validation.jl:77
[2] compile(::CUDAnative.CompilerContext) at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\driver.jl:88
[3] #compile#95 at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\driver.jl:38 [inlined]
[4] compile at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\driver.jl:36 [inlined]
[5] #compile#94 at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\driver.jl:18 [inlined]
[6] compile at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\compiler\driver.jl:16 [inlined]
[7] macro expansion at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\execution.jl:266 [inlined]
[8] #cufunction#109(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(cufunction), ::typeof(kernel_for_loop!), ::Type{Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}) at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\execution.jl:237
[9] cufunction(::Function, ::Type) at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\execution.jl:237
[10] macro expansion at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\execution.jl:205 [inlined]
[11] macro expansion at .\gcutils.jl:87 [inlined]
[12] macro expansion at C:\Users\RTX2080\.julia\packages\CUDAnative\l7sDn\src\execution.jl:202 [inlined]
[13] run_kernel_for_loop!(::CuArray{Float32,1}) at C:\git\intro_to_julia\13_mwe.jl:21
[14] top-level scope at none:0
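If I am reading the stack trace correctly, the offending frame is vect at array.jl:130, reached from line 12 of my file, which I assume corresponds to the for offset in [0,1] line. So the unsupported jl_alloc_array_1d call presumably comes from the array literal allocating inside the kernel, not from sync_threads() itself. I come back to this below the second example.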
However, if I don't call it within a for loop, then it works:
function kernel_no_loop!(x)
    index = 2*((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1
    stride = 2*(blockDim().x * gridDim().x)
    sync_threads()
    sync_threads()
    return nothing
end
function run_kernel_no_loop!(x)
    numblocks = ceil(Int, length(x)/256)
    @cuda threads=256 blocks=numblocks kernel_no_loop!(x)
end
run_kernel_no_loop!(x)
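Given the stack trace, I suspect the problem is not sync_threads() at all but the [0,1] array literal, which would require allocating a CPU Array inside the kernel. If that is right, iterating over a tuple or a range should avoid the allocation entirely. A minimal, untested sketch of what I mean (kernel_tuple_loop! is just an illustrative name):

function kernel_tuple_loop!(x)
    index = 2*((blockIdx().x - 1) * blockDim().x + threadIdx().x)-1
    stride = 2*(blockDim().x * gridDim().x)
    # a tuple (or a range such as 0:1) is an isbits value that lives on
    # the stack, so no jl_alloc_array_1d call should be generated here
    for offset in (0, 1)
        sync_threads()
    end
    return nothing
end

Is that the right explanation, or is there something else about calling sync_threads() inside a loop that I am missing?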