ERROR: LoadError: ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}

The problem is in `x_batch, y_batch = x_batch|>gpu, y_batch|>gpu`, where I move the batches to the GPU. Removing that line makes it work, but how can I make it run on the GPU? The `pullback` function is from Zygote.

Here is the full code:

using Flux
using Zygote
using CUDA
using CSV
using DataFrames
using Images
using MLDatasets
using BSON: @save, @load


function get_dataloaders(batch_size::Int, shuffle::Bool)
    train_x, train_y = MLDatasets.MNIST.traindata(Float32)
    test_x, test_y = MLDatasets.MNIST.testdata(Float32)
    train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9)
    train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=shuffle)
    test_loader = Flux.Data.DataLoader((test_x, test_y), batchsize=batch_size, shuffle=shuffle)
    return train_loader, test_loader
end

struct FFNetwork
    fc_1
    dropout
    fc_2
    FFNetwork(
        input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
    ) = new(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

function (net::FFNetwork)(x)
    x = Flux.flatten(x)
    return net.fc_2(net.dropout(net.fc_1(x)))
end

function main(num_epochs::Int, batch_size::Int, shuffle::Bool, λ::Float64)
    train_loader, test_loader = get_dataloaders(batch_size, shuffle)
    model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu
    trainable_params = Flux.params(model.fc_1, model.fc_2)
    optimiser = ADAM(λ)
    optimiser = Flux.Optimise.Optimiser(Flux.Optimise.WeightDecay(λ), optimiser)
    for epoch = 1:num_epochs
        acc_loss = 0.0
        for (x_batch, y_batch) in train_loader
            x_batch, y_batch = x_batch|>gpu, y_batch|>gpu
            loss, back = pullback(trainable_params) do
                ŷ = model(x_batch)
                Flux.Losses.logitcrossentropy(ŷ, y_batch)
            end
            gradients = back(1f0)
            Flux.Optimise.update!(optimiser, trainable_params, gradients)
            acc_loss += loss
        end
        avg_loss = acc_loss / length(train_loader)
        @info "Epoch: $epoch - Average loss: $avg_loss"
    end
end

main(10, 128, true, 0.001)

Actually, the issue is with

model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu

as it does nothing to your `FFNetwork` by default. Your model weights, etc. are still on the CPU.
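A quick way to see this (a REPL sketch; the exact alias printed may vary with your Julia version):

julia> model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu;

julia> typeof(model.fc_1.weight)   # still a plain CPU array, not a CuArray
Matrix{Float32} (alias for Array{Float32, 2})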

Try the following:

struct FFNetwork
    fc_1
    dropout
    fc_2
end

# Use an outer constructor instead
function FFNetwork(
    input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
    )
    return FFNetwork(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

# `net |> gpu` should work with this
Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
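To confirm the new dispatch actually moves the weights (a REPL sketch, assuming a functional CUDA setup):

julia> gpu_model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu;

julia> typeof(gpu_model.fc_1.weight)
CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}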

With these changes, I can run `main`.


Okay, thanks!

Hey, when I saved the model with

        let model = cpu(model) #return model to cpu before serialization
            @save "model.bson" model
        end

and then loaded it with

@load "model.bson" model
y_pred = model(img)

ERROR: LoadError: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations do not execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.
Stacktrace:

Should I open a new issue?

What image `img` are you loading?

Can you provide a complete MWE of your model saving/loading + error?

The images are loaded from the Kaggle MNIST dataset.

using Flux
using Zygote
using CUDA
using CSV
using DataFrames
using Images
using MLDatasets
using BSON: @save, @load


function get_dataloaders(batch_size::Int, shuffle::Bool)
    train_x, train_y = MLDatasets.MNIST.traindata(Float32)
    test_x, test_y = MLDatasets.MNIST.testdata(Float32)
    train_y, test_y = Flux.onehotbatch(train_y, 0:9), Flux.onehotbatch(test_y, 0:9)
    train_loader = Flux.Data.DataLoader((train_x, train_y), batchsize=batch_size, shuffle=shuffle)
    test_loader = Flux.Data.DataLoader((test_x, test_y), batchsize=batch_size, shuffle=shuffle)
    return train_loader, test_loader
end

struct FFNetwork
    fc_1
    dropout
    fc_2
end

function FFNetwork(
    input_dim::Int, hidden_dim::Int, dropout::Float32, num_classes::Int
)
    return FFNetwork(
        Dense(input_dim, hidden_dim, relu),
        Dropout(dropout),
        Dense(hidden_dim, num_classes),
    )
end

Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))

function (net::FFNetwork)(x)
    x = Flux.flatten(x)
    x = net.fc_1(x)
    x = net.dropout(x)
    return net.fc_2(x)
end



function main(num_epochs::Int, batch_size::Int, shuffle::Bool, λ::Float64)
    train_loader, test_loader = get_dataloaders(batch_size, shuffle)
    model = FFNetwork(28*28, 128, 0.2f0, 10)|>gpu
    trainable_params = Flux.params(model.fc_1, model.fc_2)
    optimiser = ADAM(λ)
    optimiser = Flux.Optimise.Optimiser(Flux.Optimise.WeightDecay(λ), optimiser)
    for epoch = 1:num_epochs
        acc_loss = 0.0
        for (x_batch, y_batch) in train_loader
            x_batch, y_batch = x_batch|>gpu, y_batch|>gpu
            loss, back = pullback(trainable_params) do
                Flux.Losses.logitcrossentropy(model(x_batch), y_batch)
            end
            gradients = back(1f0)
            Flux.Optimise.update!(optimiser, trainable_params, gradients)
            acc_loss += loss
        end
        avg_loss = acc_loss / length(train_loader)
        @info "Epoch: $epoch - Average loss: $avg_loss"
        let model = cpu(model) #return model to cpu before serialization
            @save "model.bson" model
        end
    end
end

main(10, 128, true, 0.001)

test = CSV.File("mnist/test.csv") |> DataFrame
img = test[11, :] |> Array
img = img/255 # 0 to 1
img = reshape(img,28,28)
img = transpose(img)
Gray.(img)

img = test[11,:] |> Array
img = img/255
img = transpose(img)
img = reshape(img,28,28,1,:)
@load "model.bson" model
y_pred = model(img)
print((Flux.onecold(y_pred).-1)[1])

Here is the link for test.csv: Digit Recognizer | Kaggle

Thanks for the code. Note that you're serializing your GPU model. You also need to dispatch `cpu` on your `FFNetwork`.

EDIT: You can verify this with:

julia> Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2));

julia> gpu_model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu
FFNetwork(Dense(784, 128, relu), Dropout(0.2), Dense(128, 10))

julia> cpu_model = gpu_model |> cpu
FFNetwork(Dense(784, 128, relu), Dropout(0.2), Dense(128, 10))

julia> cpu_model.fc_1.weight
128×784 Array{Float32,2}:
...

The principle I want is to train the model on the GPU and, in production, predict on the CPU.
Do I need to dispatch to `cpu` before saving, or after loading the model? Or just at the beginning?

Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2))

It should be sufficient to define these.
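Concretely: move the model to the CPU before saving, exactly as your `let model = cpu(model)` block already does, so the BSON file never contains GPU arrays.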

You are free to choose which device you run inference on. After loading, you would ultimately want to do:

@load "model.bson" model

# `device = cpu` or `device = gpu`
model = model |> device
img = img |> device
y_pred = model(img)

print((Flux.onecold(y_pred).-1)[1])

thanks!

Sorry for not recommending it earlier, but it’s far simpler to use Functors.jl.
This is especially true if you have more layers in your models, like:

struct FFNetworkV2
    fc_1
    ...
    fc_20
end

Just replace your code with:

- Flux.gpu(net::FFNetwork) = FFNetwork(gpu(net.fc_1), gpu(net.dropout), gpu(net.fc_2))
- Flux.cpu(net::FFNetwork) = FFNetwork(cpu(net.fc_1), cpu(net.dropout), cpu(net.fc_2))

+ using Functors
+ @functor FFNetwork
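With `@functor` in place, `gpu`, `cpu`, and `Flux.params` all recurse through the struct's fields automatically. A sketch of what that buys you (assuming the same constructor as above):

model = FFNetwork(28*28, 128, 0.2f0, 10) |> gpu  # moves every field to the GPU
ps = Flux.params(model)                          # collects the fc_1 and fc_2 weights

(`Dropout` has no trainable arrays, so `Flux.params(model)` gives the same set as `Flux.params(model.fc_1, model.fc_2)`.)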

Ah, no problem, thanks so much!