Issue with 1.5-month-old code based on CuArrays

Hi,

I am updating my library PseudoArcLengthContinuation.jl, and one of the tutorials based on CuArrays does not work anymore. It still worked at the end of December.

Can someone explain the error to me, please?

Thank you for your help,

The error I am getting is the following:

julia> F_shfft(sol0, -.1, 1.3, shlop = L)
ERROR: GPU compilation of #25(CuArrays.CuKernelState, CUDAnative.CuDeviceArray{Complex{Float64},2,CUDAnative.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}.
That type is not isbits, and such arguments are only allowed when they are unused by the kernel.  .f is of type Type{Complex} which is not isbits.
  .args is of type Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}} which is not isbits.
      .x is of type Array{Float64,2} which is not isbits.
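
For reference, the error is about the isbits-ness of the kernel arguments, which can be checked with isbitstype. A plain CPU Array is heap-allocated and not isbits, while Complex{Float64} is:

julia> isbitstype(Array{Float64,2})
false

julia> isbitstype(Complex{Float64})
true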

The code is as follows. Note that I removed the parts used by PseudoArcLengthContinuation.jl, so the code is independent of that package.

using AbstractFFTs, FFTW, KrylovKit
using LinearAlgebra, Plots

struct SHLinearOp
	tmp_real         # temporary real array
	tmp_complex      # temporary complex array
	l1               # Fourier multiplier (diagonal of the linear operator)
	fftplan          # in-place forward FFT plan
	ifftplan         # in-place inverse FFT plan
end

function SHLinearOp(Nx, lx, Ny, ly; AF = Array{TY})
	# AF is an array type; pass CuArray{TY} to run the following on the GPU
	k1 = vcat(collect(0:Nx/2), collect(Nx/2+1:Nx-1) .- Nx)
	k2 = vcat(collect(0:Ny/2), collect(Ny/2+1:Ny-1) .- Ny)
	d2 = [(1-(pi/lx * kx)^2 - (pi/ly * ky)^2)^2 + 1. for kx in k1, ky in k2]
	tmpc = Complex.(AF(zeros(Nx,Ny)))
	return SHLinearOp(AF(zeros(Nx,Ny)),tmpc,AF(d2),plan_fft!(tmpc),plan_ifft!(tmpc))
end

import Base: *, \

function *(c::SHLinearOp, u)
	c.tmp_complex .= Complex.(u)
	c.fftplan * c.tmp_complex              # in-place forward FFT
	c.tmp_complex .= c.l1 .* c.tmp_complex
	c.ifftplan * c.tmp_complex             # in-place inverse FFT
	c.tmp_real .= real.(c.tmp_complex)
	return copy(c.tmp_real)
end

function \(c::SHLinearOp, u)
	c.tmp_complex .= Complex.(u)
	c.fftplan * c.tmp_complex              # in-place forward FFT
	c.tmp_complex .= c.tmp_complex ./ c.l1
	c.ifftplan * c.tmp_complex             # in-place inverse FFT
	c.tmp_real .= real.(c.tmp_complex)
	return copy(c.tmp_real)
end

function F_shfft(u, l = -0.15, ν = 1.3; shlop::SHLinearOp)
	return -(shlop * u) .+ ((l+1) .* u .+ ν .* u.^2 .- u.^3)
end

using CuArrays
CuArrays.allowscalar(false)
import LinearAlgebra: mul!, axpby!
# scalar mul! and axpby! methods for CuArray
mul!(x::CuArray, y::CuArray, α::T) where {T <: Number} = (x .= α .* y)
mul!(x::CuArray, α::T, y::CuArray) where {T <: Number} = (x .= α .* y)
axpby!(a::T, X::CuArray, b::T, Y::CuArray) where {T <: Number} = (Y .= a .* X .+ b .* Y)

TY = Float64
AF = CuArray{TY}

Nx = 2^10
Ny = 2^10
lx = 8pi * 2
ly = 2*2pi/sqrt(3) * 2

X = -lx .+ 2lx/(Nx) * collect(0:Nx-1)
Y = -ly .+ 2ly/(Ny) * collect(0:Ny-1)

sol0 = [cos(x) + cos(x/2) * cos(sqrt(3) * y/2) for x in X, y in Y]
sol0 .= sol0 .- minimum(vec(sol0))
sol0 ./= maximum(vec(sol0))
sol0 = sol0 .- 0.25
sol0 .*= 1.7
		
L = SHLinearOp(Nx, lx, Ny, ly, AF = AF)
F_shfft(sol0, -.1, 1.3, shlop = L);

Please post the full backtrace; right now it's hard to understand which expression this error came from.

There have been several changes to the underlying infrastructure that might have caused this (generally, prefer Manifests if you want to be sure code keeps working). You seem to be passing a CPU Array into a broadcast somewhere (note the Array{Float64,2} in the kernel argument type), so this is likely “user error”.
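
On the Manifest point: if you commit a Manifest.toml alongside your Project.toml, you can later restore exactly the recorded package versions with

pkg> activate .
pkg> instantiate

As for the CPU Array, here is a minimal sketch of that failure mode (assuming a working CuArrays installation): broadcasting a host Array into a device array hands the GPU kernel a reference to CPU memory, which fails with exactly this KernelError:

using CuArrays
xd = CuArray(zeros(ComplexF64, 4, 4))   # device array
yh = zeros(Float64, 4, 4)               # host array
xd .= Complex.(yh)                      # KernelError: non-isbits Array passed to the kernel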

Here it is:

julia> F_shfft(sol0, -.1, 1.3, shlop = L);
ERROR: GPU compilation of #25(CuArrays.CuKernelState, CUDAnative.CuDeviceArray{Complex{Float64},2,CUDAnative.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}.
That type is not isbits, and such arguments are only allowed when they are unused by the kernel.  .f is of type Type{Complex} which is not isbits.
  .args is of type Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}} which is not isbits.
      .x is of type Array{Float64,2} which is not isbits.


Stacktrace:
 [1] check_invocation(::CUDAnative.CompilerJob, ::LLVM.Function) at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/compiler/validation.jl:72
 [2] macro expansion at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/compiler/driver.jl:192 [inlined]
 [3] macro expansion at /home/rveltz/.julia/packages/TimerOutputs/7Id5J/src/TimerOutput.jl:228 [inlined]
 [4] #codegen#154(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.codegen), ::Symbol, ::CUDAnative.CompilerJob) at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/compiler/driver.jl:191
 [5] #codegen at ./none:0 [inlined]
 [6] #compile#153(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.compile), ::Symbol, ::CUDAnative.CompilerJob) at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/compiler/driver.jl:52
 [7] #compile at ./none:0 [inlined]
 [8] #compile#152 at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/compiler/driver.jl:33 [inlined]
 [9] #compile at ./none:0 [inlined] (repeats 2 times)
 [10] macro expansion at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/execution.jl:393 [inlined]
 [11] #cufunction#198(::Nothing, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.cufunction), ::GPUArrays.var"#25#26", ::Type{Tuple{CuArrays.CuKernelState,CUDAnative.CuDeviceArray{Complex{Float64},2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}}}) at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/execution.jl:360
 [12] cufunction(::Function, ::Type) at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/execution.jl:360
 [13] macro expansion at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/execution.jl:179 [inlined]
 [14] macro expansion at ./gcutils.jl:91 [inlined]
 [15] macro expansion at /home/rveltz/.julia/packages/CUDAnative/JfXpo/src/execution.jl:176 [inlined]
 [16] _gpu_call(::CuArrays.CuArrayBackend, ::Function, ::CuArray{Complex{Float64},2,Nothing}, ::Tuple{CuArray{Complex{Float64},2,Nothing},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}}, ::Tuple{Tuple{Int64},Tuple{Int64}}) at /home/rveltz/.julia/packages/CuArrays/1njKF/src/gpuarray_interface.jl:62
 [17] gpu_call(::Function, ::CuArray{Complex{Float64},2,Nothing}, ::Tuple{CuArray{Complex{Float64},2,Nothing},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},Type{Complex},Tuple{Base.Broadcast.Extruded{Array{Float64,2},Tuple{Bool,Bool},Tuple{Int64,Int64}}}}}, ::Int64) at /home/rveltz/.julia/packages/GPUArrays/1wgPO/src/abstract_gpu_interface.jl:151
 [18] gpu_call at /home/rveltz/.julia/packages/GPUArrays/1wgPO/src/abstract_gpu_interface.jl:128 [inlined]
 [19] copyto! at /home/rveltz/.julia/packages/GPUArrays/1wgPO/src/broadcast.jl:48 [inlined]
 [20] copyto! at ./broadcast.jl:863 [inlined]
 [21] materialize!(::CuArray{Complex{Float64},2,Nothing}, ::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{2},Nothing,Type{Complex},Tuple{Array{Float64,2}}}) at ./broadcast.jl:822
 [22] *(::SHLinearOp, ::Array{Float64,2}) at ./REPL[82]:2
 [23] #F_shfft#6(::SHLinearOp, ::typeof(F_shfft), ::Array{Float64,2}, ::Float64, ::Float64) at ./REPL[84]:2
 [24] (::var"#kw##F_shfft")(::NamedTuple{(:shlop,),Tuple{SHLinearOp}}, ::typeof(F_shfft), ::Array{Float64,2}, ::Float64, ::Float64) at ./none:0
 [25] top-level scope at REPL[105]:1

What a dummy…

sol0 is not on the GPU. Thank you for your help!
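
For completeness, a minimal sketch of the fix (reusing the AF = CuArray{Float64} alias from the script above): move sol0 to the GPU before calling F_shfft.

sol0 = AF(sol0)                        # copy the initial condition to the GPU
F_shfft(sol0, -.1, 1.3, shlop = L);    # every array in the broadcast is now a CuArray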