Hello,
I’m trying to implement the generalized Laguerre function acting element-wise to the elements of a CuArray.
The basic code uses a recursive function to evaluate the laguerre polynomial
using CUDA
using LinearAlgebra: BlasFloat
CUDA.allowscalar(false)
function gen_laguerre(n::Integer, alpha::Number, x::T)::T where T <: BlasFloat
n == 0 && return 1
n == 1 && return 1 + alpha - x
return ((2*n-1+alpha-x)*gen_laguerre(n-1,alpha,x)-(n-1+alpha)*gen_laguerre(n-2,alpha,x))/n
end
A = CUDA.rand(1000)
gen_laguerre.(10, 3, A)
works, but if I increase the degree of the polynomial
gen_laguerre.(30, 3, A)
CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] throw_api_error(res::CUDA.cudaError_enum)
@ CUDA ~/.julia/packages/CUDA/HzJ3E/lib/cudadrv/libcuda.jl:27
[2] check
@ ~/.julia/packages/CUDA/HzJ3E/lib/cudadrv/libcuda.jl:34 [inlined]
[3] cuMemAllocAsync
@ ~/.julia/packages/CUDA/HzJ3E/lib/utils/call.jl:26 [inlined]
[4] alloc(::Type{CUDA.Mem.DeviceBuffer}, bytesize::Int64; async::Bool, stream::CuStream, pool::Nothing)
@ CUDA.Mem ~/.julia/packages/CUDA/HzJ3E/lib/cudadrv/memory.jl:83
[5] alloc
@ ~/.julia/packages/CUDA/HzJ3E/lib/cudadrv/memory.jl:71 [inlined]
[6] actual_alloc(bytes::Int64; async::Bool, stream::CuStream)
@ CUDA ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:65
[7] actual_alloc
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:59 [inlined]
[8] #993
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:417 [inlined]
[9] retry_reclaim(f::CUDA.var"#993#995"{CuStream, Int64}, isfailed::typeof(isnothing))
@ CUDA ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:337
[10] macro expansion
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:416 [inlined]
[11] macro expansion
@ ./timing.jl:393 [inlined]
[12] #_alloc#992
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:413 [inlined]
[13] _alloc
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:408 [inlined]
[14] #alloc#991
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:398 [inlined]
[15] alloc
@ ~/.julia/packages/CUDA/HzJ3E/src/pool.jl:392 [inlined]
[16] CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}(#unused#::UndefInitializer, dims::Tuple{Int64})
@ CUDA ~/.julia/packages/CUDA/HzJ3E/src/array.jl:93
[17] CuArray
@ ~/.julia/packages/CUDA/HzJ3E/src/array.jl:176 [inlined]
[18] CuArray
@ ~/.julia/packages/CUDA/HzJ3E/src/array.jl:187 [inlined]
[19] similar
@ ./abstractarray.jl:882 [inlined]
[20] similar
@ ./abstractarray.jl:881 [inlined]
[21] similar
@ ~/.julia/packages/CUDA/HzJ3E/src/broadcast.jl:11 [inlined]
[22] copy
@ ~/.julia/packages/GPUArrays/TnEpb/src/host/broadcast.jl:37 [inlined]
[23] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(gen_laguerre), Tuple{Int64, Int64, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}})
@ Base.Broadcast ./broadcast.jl:873
[24] top-level scope
@ In[25]:1
And then it is impossible to calculate again a lower order Laguerre polynomial.
I can’t figure out the source of the problem.
EDIT
I noticed that on the CPU it tooks a lot of time when n>10. Maybe this is the reason of the fail. Howerver I can’t understand why it fails, while I understand the long calculation time on the CPU.