I’m trying to implement a vectorized square root for complex CUDA arrays. As a starting point I’m adapting the commented-out implementation at https://github.com/JuliaLang/julia/blob/5d8225a1116685e449fcc08b207acdd0f5106409/base/complex.jl#L508, just to get something working: it probably has underflow/overflow problems and is horribly slow, but it’s easier for a beginner to vectorize (maybe I should just write a kernel from the start). My current effort is
using CUDA
CUDA.allowscalar(false)
Xz_h = rand(Complex{Float64}, 5, 5)
Xz_d = CuMatrix(Xz_h)
function sqrt2(z::CuArray{Complex{T}}) where T
    ret = CuArray{Complex{T}}(undef, size(z)...)
    rz = float(real(z))
    iz = float(imag(z))
    r = sqrt.((hypot.(rz, iz) + abs.(rz)) / 2)
    ix1 = (r .== 0)
    # the three ret[ix...] assignments are shown commented out here;
    # uncommenting any of them triggers the traceback below
    # ret[ix1] = (zero(iz) + iz*im)[ix1]
    ix2 = (rz .>= 0)
    # ret[ix2] = (r + (iz ./ r / 2)*im)[ix2]
    ix3 = iszero.(ix1 + ix2)
    # ret[ix3] = ((abs.(iz) ./ r / 2) + copysign.(r, iz)*im)[ix3]
    return ret
end
function sqrt3(z::AbstractArray{Complex{T}}) where T
    ret = Array{Complex{T}}(undef, size(z)...)
    rz = float(real(z))
    iz = float(imag(z))
    r = sqrt.((hypot.(rz, iz) + abs.(rz)) / 2)
    ix1 = (r .== 0)
    ret[ix1] = (zero(iz) + iz*im)[ix1]
    ix2 = (rz .>= 0)
    ret[ix2] = (r + (iz ./ r / 2)*im)[ix2]
    ix3 = iszero.(ix1 + ix2)
    ret[ix3] = ((abs.(iz) ./ r / 2) + copysign.(r, iz)*im)[ix3]
    return ret
end
where the CPU version sqrt3 matches the result of sqrt., but the ret[ix...] assignment lines in the GPU version give this traceback:
julia> sqrt2(Xz_d)
ERROR: ReadOnlyMemoryError()
Stacktrace:
[1] macro expansion at /home/eric/.julia/packages/LLVM/AGEVG/src/interop/base.jl:80 [inlined]
[2] macro expansion at /home/eric/.julia/packages/LLVM/AGEVG/src/interop/pointer.jl:7 [inlined]
[3] pointerref at /home/eric/.julia/packages/LLVM/AGEVG/src/interop/pointer.jl:7 [inlined]
[4] unsafe_load at /home/eric/.julia/packages/LLVM/AGEVG/src/interop/pointer.jl:79 [inlined]
[5] arrayref at /home/eric/.julia/packages/CUDA/gKMm0/src/device/array.jl:80 [inlined]
[6] getindex at /home/eric/.julia/packages/CUDA/gKMm0/src/device/array.jl:99 [inlined]
[7] iterate at /home/eric/.julia/packages/CUDA/gKMm0/src/device/array.jl:145 [inlined]
[8] iterate at /home/eric/.julia/packages/CUDA/gKMm0/src/device/array.jl:144 [inlined]
[9] _simple_count(::typeof(identity), ::CuDeviceArray{Bool,2,1}) at ./reduce.jl:868
[10] _count at ./reducedim.jl:392 [inlined]
[11] #count#624 at ./reducedim.jl:390 [inlined]
[12] #count#623 at ./reducedim.jl:389 [inlined]
[13] count at ./reducedim.jl:389 [inlined]
[14] LogicalIndex at ./multidimensional.jl:636 [inlined]
[15] LogicalIndex at ./multidimensional.jl:639 [inlined]
[16] adapt_structure at /home/eric/.julia/packages/Adapt/8kQMV/src/wrappers.jl:12 [inlined]
[17] adapt at /home/eric/.julia/packages/Adapt/8kQMV/src/Adapt.jl:40 [inlined]
[18] cudaconvert at /home/eric/.julia/packages/CUDA/gKMm0/src/compiler/execution.jl:143 [inlined]
[19] map at ./tuple.jl:159 [inlined]
[20] map at ./tuple.jl:160 [inlined] (repeats 2 times)
[21] #launch_heuristic#845 at /home/eric/.julia/packages/CUDA/gKMm0/src/gpuarrays.jl:17 [inlined]
[22] launch_heuristic at /home/eric/.julia/packages/CUDA/gKMm0/src/gpuarrays.jl:17 [inlined]
[23] gpu_call(::typeof(GPUArrays.setindex_kernel), ::CuArray{Complex{Float64},2}, ::CuArray{Complex{Float64},1}, ::Tuple{Int64}, ::Int64, ::Base.LogicalIndex{CartesianIndex{2},CuArray{Bool,2}}; target::CuArray{Complex{Float64},2}, total_threads::Int64, threads::Nothing, blocks::Nothing, name::Nothing) at /home/eric/.julia/packages/GPUArrays/Ck0bk/src/device/execution.jl:61
[24] _setindex!(::CuArray{Complex{Float64},2}, ::CuArray{Complex{Float64},1}, ::Base.LogicalIndex{CartesianIndex{2},CuArray{Bool,2}}) at /home/eric/.julia/packages/GPUArrays/Ck0bk/src/host/indexing.jl:161
[25] setindex! at /home/eric/.julia/packages/GPUArrays/Ck0bk/src/host/indexing.jl:151 [inlined]
[26] sqrt2(::CuArray{Complex{Float64},2}) at /tmp/load.jl:16
[27] top-level scope at REPL[4]:1
[28] run_repl(::REPL.AbstractREPL, ::Any) at /build/julia/src/julia-1.5.2/usr/share/julia/stdlib/v1.5/REPL/src/REPL.jl:288
and even a plain sliced assignment doesn’t work:
julia> Xz_d[1:3] = 2.2
2.2
julia> ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
I don’t think anything is wrong with my GPU setup, since all the CUDA.jl tests pass.
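For reference, the alternative I alluded to at the top would be to skip the masked assignments entirely: write the per-element piecewise formula as a scalar function and broadcast it over the CuArray, so the whole thing compiles to a single fused kernel. This is only an untested sketch (the name sqrt_scalar is mine, and it carries the same underflow/overflow caveats as the commented-out Base code); I haven’t checked it against sqrt. yet:
using CUDA
# Scalar version of the same piecewise formula as sqrt2/sqrt3.
# Broadcasting it over a CuArray should compile to one GPU kernel,
# so no logical indexing or intermediate mask arrays are needed.
function sqrt_scalar(z::Complex{T}) where T
    rz = float(real(z))
    iz = float(imag(z))
    r = sqrt((hypot(rz, iz) + abs(rz)) / 2)
    if r == 0
        return complex(zero(iz), iz)
    elseif rz >= 0
        return complex(r, iz / r / 2)
    else
        return complex(abs(iz) / r / 2, copysign(r, iz))
    end
end
If this works, sqrt_scalar.(Xz_d) should give a 5×5 CuMatrix{Complex{Float64}} in one kernel launch, which would also cover the “just write a kernel from the start” idea without my having to use @cuda directly.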