Error in Cuda function : ERROR: LoadError: CuError(1, nothing)

I have this error while running Cuda

ERROR: LoadError: CuError(1, nothing)
Stacktrace:
 [1] (::getfield(CUDAdrv, Symbol("##25#26")){Bool,Int64,CuStream,CuFunction})(::Array{Ptr{Nothing},1}) at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\base.jl:147
 [2] macro expansion at .\gcutils.jl:87 [inlined]
 [3] macro expansion at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:61 [inlined]
 [4] pack_arguments(::getfield(CUDAdrv, Symbol("##25#26")){Bool,Int64,CuStream,CuFunction}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,2,CUDAnative.AS.Global}, ::CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global}, ::Int64, ::Int64, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}) at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:40
 [5] #launch#24(::Tuple{Int64,Int64,Int64}, ::Tuple{Int64,Int64,Int64}, ::Bool, ::Int64, ::CuStream, ::Function, ::CuFunction, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::Vararg{Any,N} where N) at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:90
 [6] #launch at .\none:0 [inlined]
 [7] #30 at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:179 [inlined]
 [8] macro expansion at .\gcutils.jl:87 [inlined]
 [9] macro expansion at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:139 [inlined]
 [10] convert_arguments at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:123 [inlined]
 [11] #cudacall#29 at C:\Users\Wiktor\.julia\packages\CUDAdrv\WVU1H\src\execution.jl:178 [inlined]
 [12] #cudacall at .\none:0 [inlined]
 [13] #cudacall#160 at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:279 [inlined]
 [14] #cudacall at .\none:0 [inlined]
 [15] macro expansion at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:260 [inlined]
 [16] #call#148(::Base.Iterators.Pairs{Symbol,Tuple{Int64,Int64,Int64},Tuple{Symbol,Symbol},NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}}, ::typeof(CUDAnative.call), ::CUDAnative.HostKernel{BiotSavartCalculation.biotSavartCalculation,Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,2,CUDAnative.AS.Global},CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global},Int64,Int64,CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,2,CUDAnative.AS.Global}, ::CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global}, ::Int64, ::Int64, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,3,CUDAnative.AS.Global}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:237
 [17] (::getfield(CUDAnative, Symbol("#kw##call")))(::NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}, ::typeof(CUDAnative.call), ::CUDAnative.HostKernel{BiotSavartCalculation.biotSavartCalculation,Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,2,CUDAnative.AS.Global},CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global},Int64,Int64,CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::Vararg{Any,N} where N) at .\none:0
 [18] #call#163(::Base.Iterators.Pairs{Symbol,Tuple{Int64,Int64,Int64},Tuple{Symbol,Symbol},NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}}, ::CUDAnative.HostKernel{BiotSavartCalculation.biotSavartCalculation,Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,2,CUDAnative.AS.Global},CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global},Int64,Int64,CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::Vararg{Any,N} where N) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:406
 [19] (::getfield(CUDAnative, Symbol("#kw#HostKernel")))(::NamedTuple{(:blocks, :threads),Tuple{Tuple{Int64,Int64,Int64},Tuple{Int64,Int64,Int64}}}, ::CUDAnative.HostKernel{BiotSavartCalculation.biotSavartCalculation,Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,2,CUDAnative.AS.Global},CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global},Int64,Int64,CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}, ::CuDeviceArray{Float32,1,CUDAnative.AS.Global}, ::Vararg{Any,N} where N) at .\none:0
 [20] macro expansion at .\gcutils.jl:87 [inlined]
 [21] macro expansion at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:171 [inlined]
 [22] PrepareArrangement(::Base.RefValue{Bool}) at c:\Users\Wiktor\MagneticField3DGPUVersion\src\generateMap.jl:59
 [23] top-level scope at c:\Users\Wiktor\MagneticField3DGPUVersion\src\MagneticField3D.jl:95 [inlined]
 [24] top-level scope at .\none:0
 [25] include_string(::Module, ::String, ::String) at .\loading.jl:1008
 [26] (::getfield(Main._vscodeserver, Symbol("##8#10")){String,Int64,Int64,String})() at c:\Users\Wiktor\.vscode\extensions\julialang.language-julia-0.12.0\scripts\terminalserver\terminalserver.jl:153
 [27] hideprompt(::getfield(Main._vscodeserver, Symbol("##8#10")){String,Int64,Int64,String}) at c:\Users\Wiktor\.vscode\extensions\julialang.language-julia-0.12.0\scripts\terminalserver\repl.jl:28
 [28] macro expansion at c:\Users\Wiktor\.vscode\extensions\julialang.language-julia-0.12.0\scripts\terminalserver\terminalserver.jl:148 [inlined]
 [29] (::getfield(Main._vscodeserver, Symbol("##7#9")))() at .\task.jl:259
in expression starting at c:\Users\Wiktor\MagneticField3DGPUVersion\src\MagneticField3D.jl:82

Here’s the moment when I run function:

if CImGui.Button("Generate plot")
                x=cu(collect(X[1]:X[2]:X[3]))
                y=cu(collect(Y[1]:Y[2]:Y[3]))
                z=cu(collect(Z[1]:Z[2]:Z[3]))
                # try
                lenX=length(x)
                lenY=length(y)
                lenZ=length(z)
                segmentlength=length(S)
                variant=cu(zeros(segmentlength))
                d_Segment=cu(hcat(S...))
                I=cu(hcat(Itab))
                SegmentsCalculated=CuArray{Float32}(undef,numberOnLine*3,segmentlength)
                optimalBlocks=2*attribute(CuDevice(0),CUDAdrv.MULTIPROCESSOR_COUNT)
                @cuda blocks=optimalBlocks,1,1 threads=Int(numberOnLine/optimalBlocks),3,segmentlength divideLine(d_Segment,numberOnLine,SegmentsCalculated)
                @cuda blocks=segmentlength threads=1 checkvariant(d_Segment,variant)
                # SegmentsCalculated=Array{Float32}(collect(SegmentsCalculated))
                # variant=Array{Float32}(collect(variant))
                B=cu(zeros(lenX,lenY,lenZ))
                Bx=By=Bz=B

                @cuda blocks=1,1,1 threads = lenX,lenY,lenZ biotSavartCalculation(x,y,z,SegmentsCalculated,I,numberOnLine,segmentlength,Bx,By,Bz,variant)
...

On CPU version it works perfect. The problem starts in last line.
Here’s the program(but very simplified):

function biotSavartCalculation(x,y,z,Segment,I,SegmentsOnElement,segmentlength,Bx,By,Bz,variant)
  xIndex=(blockIdx().x-1) * blockDim().x + threadIdx().x
  yIndex=(blockIdx().y-1) * blockDim().y + threadIdx().y
  zIndex=(blockIdx().z-1) * blockDim().z + threadIdx().z

  offset=xIndex+(yIndex-1)*blockDim().x*gridDim().x+(zIndex-1)*blockDim().x*gridDim().x*blockDim().y*gridDim().y
  while segmentlength>0
    segmentlength-=1
    columnX=3*SegmentsOnElement*segmentlength
    columnY=Int(3*SegmentsOnElement*segmentlength+SegmentsOnElement/3)
    columnZ=Int(3*SegmentsOnElement*segmentlength+SegmentsOnElement*2/3)
    i=SegmentsOnElement-1
    while i>0
      δx=x[xIndex]-Segment[columnX+i]
      δy=y[yIndex]-Segment[columnY+i]
      integral=1.0f0
      variant[segmentlength+1]==1.0f0 ? (L=Segment[columnZ+i+1]-Segment[columnZ+i];
       q=δx*δx+δy*δy;
        Bx[offset]+=integral*δy; By[offset]+=integral*δx) :
         (Sx=Segment[columnX+i+1]-Segment[columnX+i]; Sy=Segment[columnY+i+1]-Segment[columnY+i];
          L=sqrt(Sx*Sx+Sy*Sy);
          mi=(Segment[columnZ+i+1]-Segment[columnZ+i])/Sy;
          α=Sx/L; β=Sy/L;
          a=1+mi*mi;δz=z[zIndex]+mi*Segment[columnY+i]-Segment[columnZ+i]; p=(α*δx+β*δy+mi*δz)/a; q=δx*δx+δy*δy+δz*δz-p*p*a;
            Bx[offset]+=integral*(β*δz-mi*δy); By[offset]+=integral*(α*δz-mi*δx); Bz[offset]+=integral*(α*δy-β*δx))
      i-=1
    end
  end
  
  return nothing
end

I think that there isn’t any detail problem(like wrong indexing), but more global problem like using incorrect loops, wrong data types, wrong dependencies in package or something like that, because details(checked and tested by me) looks fine.

Have you got any idea?

Please provide a MWE, it’s almost impossible to debug and help an issue like this. Error 1 is an INVALID_VALUE, and typically means some invalid value is passed to an API call. You should never see it rendered like this though (CuError(1, nothing)), what versions of Julia and the CUDA packages are you using?

I run my function outside imgui program for correct input data and I get:

KernelError: recursion is currently not supported

Try inspecting the generated code with any of the @device_code_... macros.

In my input data I have to int values and 9 CuArrays:

  • typeof(x): 19-element CuArray{Float32,1}
  • typeof(y): 19-element CuArray{Float32,1}
  • typeof(z): 5-element CuArray{Float32,1}
  • typeof(SegmentsCalculated): 300×2 CuArray{Float32,2}
  • typeof(I): 3×1 CuArray{Complex{Float32},2}
  • typeof(SegmentsOnElement)
  • typeof(numberOnLine): Int64
  • typeof(segmentlength): Int64
  • typeof(Bx): 19×19×5 CuArray{Float32,3}
  • typeof(By): 19×19×5 CuArray{Float32,3}
  • typeof(Bz): 19×19×5 CuArray{Float32,3}
  • typeof(variant): 2-element CuArray{Float32,1}
    Function is run by:(lenX=length(x); lenY=length(y); lenZ=length(z)
    @cuda blocks=1,1,1 threads = lenX,lenY,lenZ biotSavartCalculation(x,y,z,SegmentsCalculated,I,numberOnLine,segmentlength,Bx,By,Bz,variant)
    and function is posted before(it have two nested while loops and I even don’t use this I array to not complicate this).
    And in later part of error I have something like this:
Stacktrace:
 [1] unaliascopy at abstractarray.jl:1118
 [2] unalias at abstractarray.jl:1101
 [3] copyto! at multidimensional.jl:874
 [4] copymutable at abstractarray.jl:844
 [5] copy at abstractarray.jl:794
 [6] unaliascopy at abstractarray.jl:1118
 [7] broadcast at broadcast.jl:707
 [8] biotSavartCalculation at C:\Users\Wiktor\.julia\dev\BiotSavartCalculation\src\BiotSavartCalculation.jl:8
Stacktrace:
 [1] (::getfield(CUDAnative, Symbol("#hook_emit_function#71")){CUDAnative.CompilerJob,Array{Core.MethodInstance,1}})(::Core.MethodInstance, ::Core.CodeInfo, ::UInt64) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\compiler\irgen.jl:93
 [2] compile_method_instance(::CUDAnative.CompilerJob, ::Core.MethodInstance, ::UInt64) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\compiler\irgen.jl:127
 [3] irgen(::CUDAnative.CompilerJob, ::Core.MethodInstance, ::UInt64) at C:\Users\Wiktor\.julia\packages\TimerOutputs\7zSea\src\TimerOutput.jl:216
 [4] #codegen#121(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::Function, ::Symbol, ::CUDAnative.CompilerJob) at C:\Users\Wiktor\.julia\packages\TimerOutputs\7zSea\src\TimerOutput.jl:216
 [5] #codegen at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\compiler\driver.jl:0 [inlined]
 [6] #compile#120(::Bool, ::Bool, ::Bool, ::Bool, ::Bool, ::Function, ::Symbol, ::CUDAnative.CompilerJob) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\compiler\driver.jl:47
 [7] #compile#119 at .\none:0 [inlined]
 [8] #compile at .\none:0 [inlined] (repeats 2 times)
 [9] macro expansion at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:388 [inlined]
 [10] #cufunction#161(::Nothing, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(cufunction), ::typeof(biotSavartCalculation), ::Type{Tuple{CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global},CuDeviceArray{Float32,2,CUDAnative.AS.Global},CuDeviceArray{Complex{Float32},2,CUDAnative.AS.Global},Int64,Int64,CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,3,CUDAnative.AS.Global},CuDeviceArray{Float32,1,CUDAnative.AS.Global}}}) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:356
 [11] cufunction(::Function, ::Type) at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:356
 [12] top-level scope at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:174
 [13] top-level scope at gcutils.jl:87
 [14] top-level scope at C:\Users\Wiktor\.julia\packages\CUDAnative\nItlk\src\execution.jl:171

and part [8] biotSavartCalculation at C:\Users\Wiktor\.julia\dev\BiotSavartCalculation\src\BiotSavartCalculation.jl:8 is just nonsense, because it’s just:
xIndex=(blockIdx().x-1) * blockDim().x + threadIdx().x which I do in every other working function.

Again, without code to execute it’s difficult to help. You’re calling into broadcast from within a kernel, which isn’t supported.

Ok, so here is the code:

using CUDAnative
using CUDAdrv
using CuArrays
function divideLine(Segment,numberofsegments,SegmentsCalculated)
  segmentNumber = (blockIdx().x-1) * blockDim().x + threadIdx().x
  coordinateNumber = (blockIdx().y-1) * blockDim().y + threadIdx().y
  elementNumber = (blockIdx().z-1) * blockDim().z + threadIdx().z

  offset=segmentNumber+(coordinateNumber-1)*blockDim().x*gridDim().x+(elementNumber-1)*blockDim().x*gridDim().x*blockDim().y*gridDim().y

  segmentoffset=coordinateNumber+(elementNumber-1)*blockDim().y*gridDim().y*2
  
  SegmentsCalculated[offset]=Segment[segmentoffset]+(segmentNumber-1)%numberofsegments*(Segment[segmentoffset+3]-Segment[segmentoffset])/(numberofsegments-1)
  return nothing
end

function checkvariant(Segment,variant)
  segmentNumber = (blockIdx().x-1) * blockDim().x + threadIdx().x
  ((Segment[(segmentNumber-1)*6+1]==Segment[(segmentNumber-1)*6+4])&&(Segment[(segmentNumber-1)*6+2]==Segment[(segmentNumber-1)*6+5])) ? variant[segmentNumber]=1.0f0 : nothing
  return nothing
end

function biotSavartCalculation(x,y,z,Segment,SegmentsOnElement,segmentlength,I,Bx,By,Bz,variant)
  xIndex=(blockIdx().x-1) * blockDim().x + threadIdx().x
  yIndex=(blockIdx().y-1) * blockDim().y + threadIdx().y
  zIndex=(blockIdx().z-1) * blockDim().z + threadIdx().z

  offset=xIndex+(yIndex-1)*blockDim().x*gridDim().x+(zIndex-1)*blockDim().x*gridDim().x*blockDim().y*gridDim().y
  while segmentlength>0
    segmentlength-=1
    columnX=3*SegmentsOnElement*segmentlength
    columnY=Int(3*SegmentsOnElement*segmentlength+SegmentsOnElement/3)
    columnZ=Int(3*SegmentsOnElement*segmentlength+SegmentsOnElement*2/3)
    i=SegmentsOnElement-1
    while i>0
      δx=x[xIndex]-Segment[columnX+i]
      δy=y[yIndex]-Segment[columnY+i]
      integral=1.0f0
      variant[segmentlength+1]==1.0f0 ? (L=Segment[columnZ+i+1]-Segment[columnZ+i];
       q=δx*δx+δy*δy;
        Bx[offset]+=integral*δy; By[offset]+=integral*δx) :
         (Sx=Segment[columnX+i+1]-Segment[columnX+i]; Sy=Segment[columnY+i+1]-Segment[columnY+i];
          L=sqrt(Sx*Sx+Sy*Sy);
          mi=(Segment[columnZ+i+1]-Segment[columnZ+i])/Sy;
          δz=z[zIndex]+mi*Segment[columnY+i]-Segment[columnZ+i]; 
          α=Sx/L; β=Sy/L;
          a=1+mi*mi; p=(α*δx+β*δy+mi*δz)/a; q=δx*δx+δy*δy+δz*δz-p*p*a;
            Bx[offset]+=integral*(β*δz-mi*δy); By[offset]+=integral*(α*δz-mi*δx); Bz[offset]+=integral*(α*δy-β*δx))
      i-=1
    end
  end
  
  return nothing
end

numberofsegments=100

x=cu(collect(1:0.5:10))

y=cu(collect(1:0.5:10))

z=cu(collect(1:1:5))

lenX=length(x)

lenY=length(y)

lenZ=length(z)

B=CuArray{Float32}(undef,lenX,lenY,lenZ)

Bx=By=Bz=B

Segment=[]

startline1=Float32[-500.0, 0.0, 1.0]

endline1=Float32[500.0, 0.0, 1.0]

startline2=Float32[0.0, -500.0 ,1.0]

endline2=Float32[0.0, 500.0 ,1.0]

push!(Segment,[startline1; endline1])

push!(Segment,[startline2; endline2])

segmentlength=length(Segment)

variant=cu(zeros(segmentlength))

d_Segment=cu(hcat(Segment...))

Itab=[]

angle=pi/5.0f0

angle2=pi/2.0f0

I=640.0f0

I2=500.0f0

push!(Itab,I*exp(im*angle))

push!(Itab,I2*exp(im*angle2))

push!(Itab,I3*exp(im*angle3))

I=cu(hcat(Itab))

SegmentsCalculated=CuArray{Float32}(undef,numberofsegments*3,segmentlength)

optimalBlocks=2*attribute(CuDevice(0),CUDAdrv.MULTIPROCESSOR_COUNT)

@cuda blocks=optimalBlocks,1,1 threads=Int(numberofsegments/optimalBlocks),3,segmentlength divideLine(d_Segment,numberofsegments,SegmentsCalculated)

@cuda blocks=segmentlength threads=1 checkvariant(d_Segment,variant)

@cuda threads=lenX,lenY,lenZ biotSavartCalculation(x,y,z,SegmentsCalculated,I,numberofsegments,segmentlength,Bx,By,Bz,variant)

Normally for integral I use QuadGK package to calculate some values, but it’s different problem.

That’s now a minimal example. Please see PSA: make it easier to help you

Debugging an issue like this first requires getting rid of as much as code as possible while preserving the error. Doing so would reduce your last kernel to SegmentsOnElement*segmentlength, a multiplication if a device array and a scalar. That’s not supported on the GPU (requires allocating a new array).

Actually, I’m going to use small steps method to go to final part, because I think that my function structure is too messy.
At this time that work:

using CUDAnative
using CuArrays
using CUDAdrv
x=3
y=4
z=5

a=cu(rand(x,y,z))
b=cu(rand(x,y,z))
c=cu(rand(x,y,z))
number=10
number2=2

element=cu(rand(number,1))
function whatever(a,b,c,number,number2,element)
  x=(blockIdx().x-1) * blockDim().x + threadIdx().x
  y=(blockIdx().y-1) * blockDim().y + threadIdx().y
  z=(blockIdx().z-1) * blockDim().z + threadIdx().z
  offset=x+(y-1)*gridDim().x*blockDim().x+(z-1)*gridDim().x*blockDim().x*gridDim().y*blockDim().y
  while number2>0
    number2-=1
    while number>1
      a[offset]+=number*(element[number]-element[number-1])
      b[offset]+=number*2*(element[number]-element[number-1])
      c[offset]+=number*3*(element[number]-element[number-1])
      number-=1
    end
  end
  return nothing
end

@cuda threads=x,y,z whatever(a,b,c,number,number2,element)

Next I’ll try to index element array(which will be two dimensional) with nested index and see how it works, create three more arrays with sizes of x, y, z and index them with single cuda indexes(x for first, y for second and z for third). And last I will check if I can execute conditional insturction(like if/else or tenary operator in that and see if it all works.

Ok, so majority of things in test function is working, so I will rewrite my package(maybe there is problem with package, not with function) like that:

using CUDAnative
using CuArrays
using CUDAdrv
x=3
y=4
z=5

a=cu(rand(x,y,z))
b=cu(rand(x,y,z))
c=cu(rand(x,y,z))
number=10
number2=2

element=cu(rand(number*3,number2))

xtab=cu(rand(x))
ytab=cu(rand(y))
ztab=cu(rand(z))

variant=cu(ones(number2))





function whatever(a,b,c,number,number2,element,xtab,ytab,ztab,variant)
  x=(blockIdx().x-1) * blockDim().x + threadIdx().x
  y=(blockIdx().y-1) * blockDim().y + threadIdx().y
  z=(blockIdx().z-1) * blockDim().z + threadIdx().z
  offset=x+(y-1)*gridDim().x*blockDim().x+(z-1)*gridDim().x*blockDim().x*gridDim().y*blockDim().y
  while number2>0
    number2-=1
    columnx=number2*3*number
    columny=number2*3*number
    columnz=number2*3*number
    while number>1
      variant[number2+1]==1.0f0 ?
      (a[offset]+=number*(element[columnx+number]-element[columnx+number-1])+xtab[x];
      b[offset]+=number*2*(element[columny+number]-element[columny+number-1])+ytab[y];
      c[offset]+=number*3*(element[columnz+number]-element[columnz+number-1])+ztab[z]) :
      (a[offset]+=number*(element[columnx+number]-element[columnx+number-1])+xtab[x];
      b[offset]+=number*2*(element[columny+number]-element[columny+number-1])+ytab[y];
      c[offset]+=number*3*(element[columnz+number]-element[columnz+number-1])+ztab[z]
      )
      number-=1
    end
  end
  return nothing
end

@cuda threads=x,y,z whatever(a,b,c,number,number2,element,xtab,ytab,ztab,variant)