My program failed with CUDA error code 700 after running for a day, with the error message shown below. Can I trust the stack trace, i.e. did the program really fail at exactly the location it reports?
...
step = 51, energy/site = -0.28279876708984375, fidelity = 0.3100615535536275
step = 52, energy/site = -0.2982025146484375, fidelity = 0.31670590995681797
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
error in running finalizer: CUDAdrv.CuError(code=700, meta=nothing)
ERROR: LoadError: CUDA error: an illegal memory access was encountered (code #700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
[1] macro expansion at /home/liujinguo/.julia/packages/CUDAdrv/y9e4P/src/base.jl:147 [inlined]
[2] #download!#11(::Bool, ::Function, ::Ptr{Float32}, ::CUDAdrv.Mem.Buffer, ::Int64, ::CUDAdrv.CuStream) at /home/ liujinguo/.julia/packages/CUDAdrv/y9e4P/src/memory.jl:254
[3] download! at /home/liujinguo/.julia/packages/CUDAdrv/y9e4P/src/memory.jl:248 [inlined] (repeats 2 times)
[4] unsafe_copyto! at /home/liujinguo/.julia/dev/CuArrays/src/array.jl:127 [inlined]
[5] copyto!(::Array{Float32,2}, ::CuArray{Float32,2}) at /home/liujinguo/.julia/dev/GPUArrays/src/abstractarray. jl:110
[6] #measure_reset!#16(::Int64, ::Function, ::DefaultRegister{4096,Complex{Float32},CuArray{Complex{Float32},2}}) at ./array.jl:497
[7] #measure_reset! at ./none:0 [inlined]
[8] #measure_reset!#29 at /home/liujinguo/.julia/dev/Yao/src/Registers/measure.jl:80 [inlined]
[9] (::getfield(Yao.Registers, Symbol("#kw##measure_reset!")))(::NamedTuple{(:val,),Tuple{Int64}}, :: typeof(measure_reset!), ::DefaultRegister{4096,Complex{Float32},CuArray{Complex{Float32},2}}, ::Int64) at ./none:0
[10] energy(::QuantumMPS{DefaultRegister{4096,Complex{Float32},CuArray{Complex{Float32},2}}}, ::Yao.Blocks. YGate{Complex{Float32}}, ::Heisenberg{2}) at /home/liujinguo/jcode/QuantumMPS/src/heisenberg.jl:114
[11] energy at /home/liujinguo/jcode/QuantumMPS/src/heisenberg.jl:56 [inlined]
[12] gradient(::QuantumMPS{DefaultRegister{4096,Complex{Float32},CuArray{Complex{Float32},2}}}, ::Yao.Blocks. QDiff{Yao.Blocks.RotationGate{1,Float32,Yao.Blocks.ZGate{Complex{Float32}}},1,Float32}, ::Heisenberg{2}) at /home/ liujinguo/jcode/QuantumMPS/src/gradient.jl:16
[13] _broadcast_getindex at ./broadcast.jl:582 [inlined]
[14] getindex at ./broadcast.jl:515 [inlined]
[15] macro expansion at ./broadcast.jl:846 [inlined]
[16] macro expansion at ./simdloop.jl:73 [inlined]
[17] copyto! at ./broadcast.jl:845 [inlined]
[18] copyto! at ./broadcast.jl:800 [inlined]
[19] copy at ./broadcast.jl:776 [inlined]
Here is the source code:
"""
    measure_reset!(reg::GPUReg{B, T}; val=0)

Measure `reg` and reset it in place, returning the per-batch outcomes as a `CuArray`.

With `regm = reg |> rank3` indexed as `regm[k, i, j]`:

1. `pl[k, j] = sum(abs2, regm[k, :, j])` is computed on the device and copied to the host.
2. For each `j in 1:B`, `_measure` is called on column `j` of `pl` to obtain one outcome
   (presumably a 0-based sample drawn with those weights; `_measure` is defined elsewhere).
3. For every `(i, j)`, the entry at `k == val + 1` is overwritten with
   `regm[res[j] + 1, i, j] / sqrt(pl[res[j] + 1, j])`, and every entry with
   `k != val + 1` is set to zero.

# Keywords
- `val=0`: 0-based index along the first dimension that receives the rescaled amplitudes.

# Throws
- `ArgumentError` if `val` is outside `0:size(regm, 1) - 1`.
- `ErrorException` if a sampled outcome is outside `0:size(regm, 1) - 1`.
"""
function measure_reset!(reg::GPUReg{B, T}; val=0) where {B, T}
    regm = reg |> rank3
    nk = size(regm, 1)

    # The kernels below index with `@inbounds`, so a bad `val` would turn into an
    # illegal device memory access instead of a Julia exception. Check it here.
    0 <= val < nk || throw(ArgumentError("val must be in 0:$(nk - 1), got $val"))

    pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
    pl_cpu = pl |> Matrix
    res_cpu = map(ib -> _measure(view(pl_cpu, :, ib), 1)[], 1:B)

    # Same reasoning as for `val`: `res[j] + 1` is used as an unchecked device index.
    all(r -> 0 <= r < nk, res_cpu) ||
        error("measure_reset!: sampled outcome outside 0:$(nk - 1): $res_cpu")
    res = CuArray(res_cpu)

    # Pass 1: write the rescaled amplitude of the sampled outcome into slot `val + 1`.
    # Within this launch each (i, j) pair has exactly one writer, and nothing else
    # writes to the element it reads, so no barrier is needed.
    function collapse_kernel(regm, res, pl, val)
        state = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        if state <= length(regm)
            k, i, j = GPUArrays.gpu_ind2sub(regm, state)
            if k == val + 1
                @inbounds rind = res[j] + 1
                @inbounds regm[k, i, j] = regm[rind, i, j] / CUDAnative.sqrt(pl[rind, j])
            end
        end
        return
    end

    # Pass 2: clear every slot except `val + 1`.
    function zero_kernel(regm, val)
        state = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        if state <= length(regm)
            k, i, j = GPUArrays.gpu_ind2sub(regm, state)
            if k != val + 1
                @inbounds regm[k, i, j] = 0
            end
        end
        return
    end

    X, Y = cudiv(length(regm))
    # Two separate launches replace the former in-kernel `sync_threads()`. That barrier
    # only covers one thread block (and sat inside a divergent branch), so it could not
    # order the read of `regm[rind, i, j]` against the zeroing write when the two
    # threads were in different blocks. Consecutive launches issued this way run in
    # order, so pass 2 starts only after pass 1 has finished.
    @cuda threads=X blocks=Y collapse_kernel(regm, res, pl, val)
    @cuda threads=X blocks=Y zero_kernel(regm, val)
    return res
end
This function is called millions of times in my program, so it is strange that it suddenly breaks after running for a long time. Is it possible that my GPU card is unstable?