ERROR: LoadError: ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}

The problem is in `x_batch, y_batch = x_batch|>gpu, y_batch|>gpu`, where I move the batches to the GPU. Removing that line makes it work, but how can I make it run on the GPU? The `pullback` function is from Zygote.

Here is the full code:

using Flux
using Zygote
using CUDA
using CSV
using DataFrames
using Images
using MLDatasets
using BSON: @save, @load


function get_dataloaders(batch_size::Int, shuffle::Bool)
    train_x, train_y = MLDatasets.MNIST.traindata(Float32)
    test_x, test_y = MLDatasets.MNIST.testdata(Float32)
    train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9)
    train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=shuffle)
    test_loader = Flux.Data.DataLoader((test_x, test_y), batchsize=batch_size, shuffle=shuffle)
    return train_loader, test_loader
end

struct FFNetwork
    fc_1
    dropout
    fc_2
    FFNetwork(
        input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
    ) = new(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

function (net::FFNetwork)(x)
    x = Flux.flatten(x)
    return net.fc_2(net.dropout(net.fc_1(x)))
end

function main(num_epochs::Int, batch_size::Int, shuffle::Bool, λ::Float64)
    train_loader, test_loader = get_dataloaders(batch_size, shuffle)
    model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu
    trainable_params = Flux.params(model.fc_1, model.fc_2)
    optimiser = ADAM(λ)
    optimiser = Flux.Optimise.Optimiser(Flux.Optimise.WeightDecay(λ), optimiser)
    for epoch = 1:num_epochs
        acc_loss = 0.0
        for (x_batch, y_batch) in train_loader
            x_batch, y_batch = x_batch|>gpu, y_batch|>gpu
            loss, back = pullback(trainable_params) do
                ŷ = model(x_batch)
                Flux.Losses.logitcrossentropy(ŷ, y_batch)
            end
            gradients = back(1f0)
            Flux.Optimise.update!(optimiser, trainable_params, gradients)
            acc_loss += loss
        end
        avg_loss = acc_loss / length(train_loader)
        @info "Epoch: $epoch - Average loss: $avg_loss"
    end
end

main(10, 128, true, 0.001)

Actually, the issue is with

model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu

as it does nothing to your `FFNetwork` by default. Your model weights, etc. are still on the CPU.
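A quick way to see this (a REPL sketch; the exact alias printed may vary with your Julia version):

julia> model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu;

julia> typeof(model.fc_1.weight)   # still a plain CPU array, not a CuArray
Matrix{Float32} (alias for Array{Float32, 2})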

Try the following:

struct FFNetwork
    fc_1
    dropout
    fc_2
end

# Use an outer constructor instead
function FFNetwork(
    input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
    )
    return FFNetwork(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

# `net |> gpu` should work with this
Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
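To confirm the new dispatch actually moves the weights (a REPL sketch, assuming a functional CUDA setup):

julia> gpu_model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu;

julia> typeof(gpu_model.fc_1.weight)
CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}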

With these changes, I can run `main`.


Okay, thanks!

Hey, when I saved the model with

        let model = cpu(model) #return model to cpu before serialization
            @save "model.bson" model
        end

and then loaded it with

@load "model.bson" model
y_pred = model(img)

ERROR: LoadError: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations do not execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.
Stacktrace:

Should I open a new issue?

What image `img` are you loading?

Can you provide a complete MWE of your model saving/loading + error?

The images are loaded from the Kaggle MNIST dataset.

using Flux
using Zygote
using CUDA
using CSV
using DataFrames
using Images
using MLDatasets
using BSON: @save, @load


function get_dataloaders(batch_size::Int, shuffle::Bool)
    train_x, train_y = MLDatasets.MNIST.traindata(Float32)
    test_x, test_y = MLDatasets.MNIST.testdata(Float32)
    train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9)
    train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=shuffle)
    test_loader = Flux.Data.DataLoader((test_x, test_y), batchsize=batch_size, shuffle=shuffle)
    return train_loader, test_loader
end

struct FFNetwork
    fc_1
    dropout
    fc_2
end

function FFNetwork(
    input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
)
    return FFNetwork(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))

function (net::FFNetwork)(x)
    x = Flux.flatten(x)
    x = net.fc_1(x)
    x = net.dropout(x)
    return net.fc_2(x)
end



function main(num_epochs::Int, batch_size::Int, shuffle::Bool, λ::Float64)
    train_loader, test_loader = get_dataloaders(batch_size, shuffle)
    model = FFNetwork(28*28, 128, 0.2f0, 10)|>gpu
    trainable_params = Flux.params(model.fc_1, model.fc_2)
    optimiser = ADAM(λ)
    optimiser = Flux.Optimise.Optimiser(Flux.Optimise.WeightDecay(λ), optimiser)
    for epoch = 1:num_epochs
        acc_loss = 0.0
        for (x_batch, y_batch) in train_loader
            x_batch, y_batch = x_batch|>gpu, y_batch|>gpu
            loss, back = pullback(trainable_params) do
                Flux.Losses.logitcrossentropy(model(x_batch), y_batch)
            end
            gradients = back(1f0)
            Flux.Optimise.update!(optimiser, trainable_params, gradients)
            acc_loss += loss
        end
        avg_loss = acc_loss / length(train_loader)
        @info "Epoch: $epoch - Average loss: $avg_loss"
        let model = cpu(model) #return model to cpu before serialization
            @save "model.bson" model
        end
    end
end

main(10, 128, true, 0.001)

test = CSV.File("mnist/test.csv") |> DataFrame
img = test[11, :] |> Array
img = img/255 # 0 to 1
img = reshape(img,28,28)
img = transpose(img)
Gray.(img)

img = test[11,:] |> Array
img = img/255
img = transpose(img)
img = reshape(img,28,28,1,:)
@load "model.bson" model
y_pred = model(img)
print((Flux.onecold(y_pred).-1)[1])

Here is the link for test.csv: Digit Recognizer | Kaggle

Thanks for the code. Note that you're serializing your GPU model. You also need to dispatch `cpu` on your `FFNetwork`.

EDIT: You can verify this with:

julia> Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2));

julia> gpu_model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu
FFNetwork(Dense(784, 128, relu), Dropout(0.2), Dense(128, 10))

julia> cpu_model = gpu_model |> cpu
FFNetwork(Dense(784, 128, relu), Dropout(0.2), Dense(128, 10))

julia> cpu_model.fc_1.weight
128×784 Array{Float32,2}:
...

The principle I want is to train the model on the GPU and, in production, predict on the CPU.
Do I need to dispatch to `cpu` before saving, or after loading the model? Or just at the beginning?

Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2))

It should be sufficient to define these.
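Concretely: move the model to the CPU before saving, exactly as your `let model = cpu(model)` block already does, so the BSON file never contains GPU arrays.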

You are free to choose which device you run inference on. After loading, you would ultimately want to do:

@load "model.bson" model

# `device = cpu` or `device = gpu`
model = model |> device
img = img |> device
y_pred = model(img)

print((Flux.onecold(y_pred).-1)[1])

thanks!

Sorry for not recommending it earlier, but it’s far simpler to use Functors.jl.
This is especially true if you have more layers in your models, like:

struct FFNetworkV2
    fc_1
    ...
    fc_20
end

Just replace your code with:

- Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
- Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2))

+ using Functors
+ @functor FFNetwork
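With `@functor` in place, `gpu`, `cpu`, and `Flux.params` all recurse through the struct's fields automatically. A sketch of what that buys you (assuming the same constructor as above):

model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu  # moves every field to the GPU
ps = Flux.params(model)                          # collects the fc_1 and fc_2 weights

(`Dropout` has no trainable arrays, so `Flux.params(model)` gives the same set as `Flux.params(model.fc_1, model.fc_2)`.)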

Ah, no problem, thanks so much!