ERROR: LoadError: CUDNNError: CUDNN_STATUS_EXECUTION_FAILED (code 8)

I’m not able to get this code running on a GPU, although it works fine on the CPU. I’ve tried different versions of CUDA, the Nvidia driver, and Julia, and every time it ends with the same error. As the code runs, GPU memory consumption keeps increasing until the device is full, and after a couple of minutes the run fails with the error below.
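
In case it's relevant, this is roughly how I'd watch the memory growth from the REPL between batches (just a sketch using CUDA.jl's memory_status / reclaim utilities, not part of the failing script):

CUDA.memory_status()          # prints used vs. total device memory
GC.gc(true); CUDA.reclaim()   # run Julia's GC and hand cached blocks back to the CUDA pool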

Code:


import Pkg; Pkg.activate(".");
using DiffEqFlux, OrdinaryDiffEq, Flux, NNlib, MLDataUtils, Printf
using Flux: logitcrossentropy
using Flux.Data: DataLoader
using MLDatasets
using CUDA
using Random: seed!
CUDA.allowscalar(false)
function loadmnist(batchsize = bs, train_split = 0.9)
    # Use MLDataUtils LabelEnc for natural onehot conversion
    onehot(labels_raw) = convertlabel(LabelEnc.OneOfK, labels_raw,
                                      LabelEnc.NativeLabels(collect(0:9)))
    # Load MNIST
    imgs, labels_raw = MNIST.traindata();
    # Process images into (H,W,C,BS) batches
    x_data = Float32.(reshape(imgs, size(imgs,1), size(imgs,2), 1, size(imgs,3)))
    y_data = onehot(labels_raw)
    (x_train, y_train), (x_test, y_test) = stratifiedobs((x_data, y_data),
                                                         p = train_split)
    return (
        # Use Flux's DataLoader to automatically minibatch and shuffle the data
        DataLoader(gpu.(collect.((x_train, y_train))); batchsize = batchsize,
                   shuffle = true),
        # Don't shuffle the test data
        DataLoader(gpu.(collect.((x_test, y_test))); batchsize = batchsize,
                   shuffle = false)
    )
end
# Main
const bs = 128
const train_split = 0.9
train_dataloader, test_dataloader = loadmnist(bs, train_split)
#down = Chain(flatten, Dense(784, 20, tanh)) |> gpu
nn = Chain(Dense(288, 64, relu),
           Dense(64, 64, relu),
           Dense(64, 288, relu)) |> gpu
nn2 = Chain(Dense(288, 64, relu),
            Dense(64, 64, relu),
            Dense(64, 288, relu)) |> gpu
nn_ode2 = NeuralODE(nn2, (0.f0, 1.f0), Tsit5(),
                    save_everystep = false,
                    reltol = 1e-3, abstol = 1e-3,
                    save_start = false) |> gpu
nn_ode = NeuralODE(nn, (0.f0, 1.f0), Tsit5(),
                   save_everystep = false,
                   reltol = 1e-3, abstol = 1e-3,
                   save_start = false) |> gpu
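# Each NeuralODE layer integrates dz/dt = nn(z) from t = 0 to t = 1 with Tsit5(); because
# save_everystep = false and save_start = false, only the final state z(1) is kept, so the
# layer maps a 288×batch input to a single 288×batch snapshot (wrapped in the solver's
# output type, hence the DiffEqArray_to_Array helper below).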
nn_ode(randn(288, 1) |> gpu)
fc  = Chain(Dense(288, 10)) |> gpu
function DiffEqArray_to_Array(x)
    xarr = gpu(x)
    return reshape(xarr, size(xarr)[1:2])
end
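# The ODE layers return their output wrapped in the solver's array type with a trailing
# singleton time dimension (only one saved point); gpu(x) converts it to a dense GPU array
# and the reshape drops that time dimension, leaving a plain (features, batch) matrix.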
# Build our overall model topology
model = Chain(Conv((3, 3), 1=>16, pad=(1,1), relu),
              x -> maxpool(x, (2,2)), Conv((3, 3), 16=>32, pad=(1,1), relu),
              x -> maxpool(x, (2,2)), Conv((3, 3), 32=>32, pad=(1,1), relu),
              x -> maxpool(x, (2,2)),
              x -> reshape(x, :, size(x, 4)),
              nn_ode,
              DiffEqArray_to_Array,
              nn_ode2,
              DiffEqArray_to_Array,
              fc) |> gpu;
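# The three Conv + maxpool stages reduce each 28×28×1 image to 3×3×32 = 288 features,
# which is why the ODE blocks and the final Dense layer all operate on width 288.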
# To understand the intermediate NN-ODE layer, we can examine its dimensionality
img, lab = train_dataloader.data[1][:, :, :, 1:1], train_dataloader.data[2][:, 1:1]
# We can see that we can compute the forward pass through the NN topology
# featuring an NNODE layer.
x_m = model(img)
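# For a single input image, x_m should be a 10×1 CuArray of class logits.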
classify(x) = argmax.(eachcol(x))
function accuracy(model, data; n_batches = 100)
    total_correct = 0
    total = 0
    for (i, (x, y)) in enumerate(collect(data))
        # Only evaluate accuracy for n_batches
        i > n_batches && break
        target_class = classify(cpu(y))
        predicted_class = classify(cpu(model(x)))
        total_correct += sum(target_class .== predicted_class)
        total += length(target_class)
    end
    return total_correct / total
end
# burn-in accuracy call
accuracy(model, train_dataloader)
loss(x, y) = logitcrossentropy(model(x), y)
# burn-in loss call
loss(img, lab)
# Implementation of SGLD (Stochastic Gradient Langevin Dynamics)
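# The step size decays polynomially, ε_t = a*(b + t)^(-γ), and each parameter array is
# updated as p ← p - (0.5ε*∇p + ε*N(0, I)): a scaled gradient step plus injected
# Gaussian noise drawn on the GPU.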
function SGLD!(graddict, paramdict, a, b, γ, t)
    ε = a*(b + t)^-γ
    for p in paramdict
        ∇p = graddict[p]
        η = ε .* gpu(randn(size(p)))
        Δp = 0.5ε*∇p + η
        p .-= Δp
    end
end
function trainMNIST()
    seed!(1)
    weights = []
    iter = 0
    cb() = begin
        iter += 1
# Monitor that the weights do in fact update
        # Every 10 training iterations show accuracy
        if iter % 10 == 1
            train_accuracy = accuracy(model, train_dataloader) * 100
            test_accuracy = accuracy(model, test_dataloader;
                                     n_batches = length(test_dataloader)) * 100
            @printf("Iter: %3d || Train Accuracy: %2.3f || Test Accuracy: %2.3f\n",
                    iter, train_accuracy, test_accuracy)
        end
    end
    Flux.@epochs 5 for (x, y) in train_dataloader
        g = gradient(() -> loss(x, y), params(model))
        SGLD!(g, params(model), .05, .5, 0.5, iter)
        cb()
        append!(weights, [deepcopy(params(model))])
    end
    weights
end
weights = trainMNIST()

Error:

[ Info: Epoch 1
ERROR: LoadError: CUDNNError: CUDNN_STATUS_EXECUTION_FAILED (code 8)
Stacktrace:
 [1] throw_api_error(::CUDA.CUDNN.cudnnStatus_t) at /home/swamy/.julia/packages/CUDA/dZvbp/lib/cudnn/error.jl:19
 [2] macro expansion at /home/swamy/.julia/packages/CUDA/dZvbp/lib/cudnn/error.jl:30 [inlined]
 [3] cudnnPoolingBackward(::Ptr{Nothing}, ::CUDA.CUDNN.PoolDesc, ::Base.RefValue{Float32}, ::CUDA.CUDNN.TensorDesc, ::CuArray{Float32,4}, ::CUDA.CUDNN.TensorDesc, ::CuArray{Float32,4}, ::CUDA.CUDNN.TensorDesc, ::CuArray{Float32,4}, ::Base.RefValue{Float32}, ::CUDA.CUDNN.TensorDesc, ::CuArray{Float32,4}) at /home/swamy/.julia/packages/CUDA/dZvbp/lib/utils/call.jl:93
 [4] cudnnPoolingBackward(::CuArray{Float32,4}, ::CuArray{Float32,4}, ::CuArray{Float32,4}, ::CuArray{Float32,4}, ::PoolDims{2,(2, 2),(2, 2),(0, 0, 0, 0),(1, 1)}; alpha::Int64, mode::Int64) at /home/swamy/.julia/packages/CUDA/dZvbp/lib/cudnn/pooling.jl:46
 [5] ∇maxpool! at /home/swamy/.julia/packages/CUDA/dZvbp/lib/cudnn/nnlib.jl:90 [inlined]
 [6] #∇maxpool#157 at /home/swamy/.julia/packages/NNlib/fxLrD/src/pooling.jl:125 [inlined]
 [7] ∇maxpool at /home/swamy/.julia/packages/NNlib/fxLrD/src/pooling.jl:123 [inlined]
 [8] #1180 at /home/swamy/.julia/packages/Zygote/ggM8Z/src/lib/nnlib.jl:92 [inlined]
 [9] #4098#back at /home/swamy/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:59 [inlined]
 [10] #maxpool#170 at /home/swamy/.julia/packages/NNlib/fxLrD/src/pooling.jl:154 [inlined]
 [11] maxpool at /home/swamy/.julia/packages/NNlib/fxLrD/src/pooling.jl:151 [inlined]
 [12] #4 at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:70 [inlined]
 [13] (::typeof(∂(#4)))(::CuArray{Float32,4}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [14] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [15] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [16] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [17] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [18] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [19] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [20] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [21] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [22] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [23] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [24] applychain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:36 [inlined]
 [25] (::typeof(∂(applychain)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [26] Chain at /home/swamy/.julia/packages/Flux/05b38/src/layers/basic.jl:38 [inlined]
 [27] (::typeof(∂(λ)))(::CuArray{Float32,2}) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [28] loss at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:104 [inlined]
 [29] (::typeof(∂(loss)))(::Float32) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [30] #13 at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:142 [inlined]
 [31] (::typeof(∂(λ)))(::Float32) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [32] (::Zygote.var"#54#55"{Zygote.Params,Zygote.Context,typeof(∂(λ))})(::Float32) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface.jl:172
 [33] gradient(::Function, ::Zygote.Params) at /home/swamy/.julia/packages/Zygote/ggM8Z/src/compiler/interface.jl:49
 [34] macro expansion at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:142 [inlined]
 [35] macro expansion at /home/swamy/.julia/packages/Flux/05b38/src/optimise/train.jl:115 [inlined]
 [36] macro expansion at /home/swamy/.julia/packages/Juno/n6wyj/src/progress.jl:134 [inlined]
 [37] trainMNIST() at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:141
 [38] top-level scope at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:151
 [39] include(::Function, ::Module, ::String) at ./Base.jl:380
 [40] include(::Module, ::String) at ./Base.jl:368
 [41] exec_options(::Base.JLOptions) at ./client.jl:296
 [42] _start() at ./client.jl:506
in expression starting at /home/swamy/vishal/MSML21_BayesianNODE/mnist_test.jl:151