Moving a custom loss function to GPU

I have developed the following custom loss function and am trying to run it efficiently on the GPU, but I have been unable to do so: I get the following error on the line `Matrix(ω' * yₖ_batch)`:

ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Float32}}, ::CUDA.CuPtr{Float32})
Closest candidates are:
  unsafe_convert(::Type{CUDA.CuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/35NC6/src/pointer.jl:208
  unsafe_convert(::Type{CUDA.PtrOrCuPtr{T}}, ::Any) where T at ~/.julia/packages/CUDA/35NC6/src/pointer.jl:118
  unsafe_convert(::Type{<:Union{CUDA.CuArrayPtr, CUDA.CuPtr, Ptr}}, ::CUDA.Mem.AbstractBuffer) at ~/.julia/packages/CUDA/35NC6/lib/cudadrv/memory.jl:33

The loss function is the following:

"""
    sliced_invariant_statistical_loss_optimized_2(nn_model, loader, hparams)

Run one Adam update of `nn_model` per batch yielded by `loader` and return the loss
recorded for each batch.

For every batch, `hparams.m` directions are drawn with `sample_random_direction`, the model
output for a fresh noise batch is projected onto each direction, and the resulting `aₖ`
histogram is scored with `scalar_diff`. The loss is the mean score over the directions.

# Arguments
- `nn_model`: Flux model; it is called on a `Float32` noise batch.
- `loader`: iterable of data batches. `loader.batchsize` must equal `hparams.samples` and
  `length(loader)` must equal `hparams.epochs` (both are asserted).
- `hparams`: hyper-parameters; the fields read here are `samples`, `epochs`, `η`, `m`, `K`
  and `noise_model`.

# Returns
A `Vector{Float32}` holding one loss value per batch.
"""
function sliced_invariant_statistical_loss_optimized_2(nn_model, loader, hparams)
    @assert loader.batchsize == hparams.samples
    @assert length(loader) == hparams.epochs
    losses = Vector{Float32}()
    optim = Flux.setup(Flux.Adam(hparams.η), nn_model)

    # The column windows depend only on `hparams`, so build them once instead of once per
    # direction inside the differentiated closure.
    # NOTE(review): each window `start_col:(end_col - 1)` spans `K - 1` columns and columns
    # `1:(K - 1)` are never read — confirm this is intended.
    start_cols = hparams.K * (1:(hparams.samples - 1))
    end_cols = hparams.K * (2:(hparams.samples)) .- 1

    @showprogress for data in loader
        Ω = gpu(ThreadsX.map(_ -> sample_random_direction(size(data, 1)), 1:(hparams.m)))
        loss, grads = Flux.withgradient(nn_model) do nn
            total = 0.0f0
            # Generate all random numbers in one go
            x_batch = gpu(rand(hparams.noise_model, hparams.samples * hparams.K))

            # Process batch through nn_model
            yₖ_batch = nn(Float32.(x_batch))
            for ω in Ω
                # Project the model output onto ω as a 1×N matrix that stays on the same
                # device as `yₖ_batch`. Wrapping the product in `Matrix(...)` copied it to
                # the host, so the pullback mixed a CPU cotangent with GPU arrays and
                # failed in CPU BLAS (`gemv!`).
                s = reshape(ω, 1, :) * yₖ_batch

                # Create slices of 's' for all 'aₖ_slice'
                aₖ_slices = [
                    s[:, start_col:(end_col - 1)] for
                    (start_col, end_col) in zip(start_cols, end_cols)
                ]

                # Compute the dot products for all iterations at once
                ω_data_dot_products = [dot(ω, data[:, i]) for i in 2:(hparams.samples)]

                # Apply 'generate_aₖ' for each pair and sum the results
                aₖ = sum([
                    generate_aₖ(aₖ_slice, ω_data_dot_product) for
                    (aₖ_slice, ω_data_dot_product) in zip(aₖ_slices, ω_data_dot_products)
                ])
                total += scalar_diff(aₖ ./ sum(aₖ))
            end
            total / hparams.m
        end
        Flux.update!(optim, nn_model, grads[1])
        push!(losses, loss)
    end
    return losses
end

I call it as follows:

# Every object is moved through `device`, so CPU/GPU placement is chosen in one place.
device = gpu

model = device(Generator(latent_dim))
# Alternative generator written as a plain `Chain` (disabled):
#model = Chain( ConvTranspose((7, 7), 100 => 256, stride=1, padding=0), BatchNorm(256, relu), ConvTranspose((4, 4), 256 => 128, stride=2, padding=1), BatchNorm(128, relu), ConvTranspose((4, 4), 128 => 1, stride=2, padding=1), tanh ))

# Mean vector (zero vector of length `dims`)
mean_vector = zeros(dims)

# Covariance matrix (identity matrix of size `dims` x `dims`)
cov_matrix = Diagonal(ones(dims))

# Create the multivariate normal distribution
noise_model = device(MvNormal(mean_vector, cov_matrix))

n_samples = 10000

# A single value drives both the loader batch size and `hparams.samples`, because
# `sliced_invariant_statistical_loss_optimized_2` asserts that the two are equal.
batch_size = 1000

# `epochs` must equal the number of batches the loader yields; the loss function asserts
# `length(loader) == hparams.epochs`.
hparams = device(
    HyperParamsSlicedISL(;
        K=10, samples=batch_size, epochs=60, η=1e-2, noise_model=noise_model, m=100
    ),
)

# Create a data loader for training
# Unshuffled variant that is not moved to the device (disabled):
#train_loader = DataLoader(train_x; batchsize=batch_size, shuffle=false, partial=false)
train_loader = device(
    DataLoader(train_x; batchsize=batch_size, shuffle=true, partial=false)
)

sliced_invariant_statistical_loss_optimized_2(model, train_loader, hparams)

Can someone help me?

Without looking into the details, the above error suggests that you’re invoking CPU functionality assuming a CPU pointer with a GPU input (assuming the pointers were obtained by calling the pointer function). The backtrace should reveal more, but you didn’t include that in your post.

1 Like

Thank you very much for your answer! The full backtrace is:

ERROR: MethodError: no method matching unsafe_convert(::Type{Ptr{Float32}}, ::CUDA.CuPtr{Float32})
Closest candidates are:
  unsafe_convert(::Type{CUDA.CuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/35NC6/src/pointer.jl:208
  unsafe_convert(::Type{CUDA.PtrOrCuPtr{T}}, ::Any) where T at ~/.julia/packages/CUDA/35NC6/src/pointer.jl:118
  unsafe_convert(::Type{<:Union{CUDA.CuArrayPtr, CUDA.CuPtr, Ptr}}, ::CUDA.Mem.AbstractBuffer) at ~/.julia/packages/CUDA/35NC6/lib/cudadrv/memory.jl:33
  ...
Stacktrace:
  [1] gemv!(trans::Char, alpha::Float32, A::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, X::Vector{Float32}, beta::Float32, Y::Vector{Float32})
    @ LinearAlgebra.BLAS /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/blas.jl:666
  [2] gemv!(y::Vector{Float32}, tA::Char, A::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::Vector{Float32}, α::Bool, β::Bool)
    @ LinearAlgebra /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:503
  [3] mul!
    @ /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:65 [inlined]
  [4] mul!
    @ /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:276 [inlined]
  [5] *
    @ /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:52 [inlined]
  [6] *
    @ /usr/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:119 [inlined]
  [7] (::ChainRules.var"#1457#1460"{Adjoint{Float32, Vector{Float32}}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ChainRulesCore.ProjectTo{Adjoint, NamedTuple{(:parent,), Tuple{ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}}}}})()
    @ ChainRules ~/.julia/packages/ChainRules/pEOSw/src/rulesets/Base/arraymath.jl:36
  [8] unthunk
    @ ~/.julia/packages/ChainRulesCore/zoCjl/src/tangent_types/thunks.jl:204 [inlined]
  [9] wrap_chainrules_output
    @ ~/.julia/packages/Zygote/WOy6z/src/compiler/chainrules.jl:110 [inlined]
 [10] map
    @ ./tuple.jl:223 [inlined]
 [11] wrap_chainrules_output
    @ ~/.julia/packages/Zygote/WOy6z/src/compiler/chainrules.jl:111 [inlined]
 [12] (::Zygote.ZBack{ChainRules.var"#times_pullback#1459"{Adjoint{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Adjoint, NamedTuple{(:parent,), Tuple{ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}}}}}})(dy::Adjoint{Float32, Vector{Float32}})
    @ Zygote ~/.julia/packages/Zygote/WOy6z/src/compiler/chainrules.jl:211
 [13] Pullback
    @ ~/Datos/github/ISL/src/CustomLossFunction.jl:667 [inlined]
 [14] (::Zygote.Pullback{Tuple{ISL.var"#376#381"{HyperParamsSlicedISL, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Chain{Tuple{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, var"#5#7", ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, typeof(Flux.flatten), var"#6#8"}}}, Any})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/WOy6z/src/compiler/interface2.jl:0
 [15] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{ISL.var"#376#381"{HyperParamsSlicedISL, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Chain{Tuple{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, var"#5#7", ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, typeof(Flux.flatten), var"#6#8"}}}, Any}})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/WOy6z/src/compiler/interface.jl:45
 [16] withgradient(f::Function, args::Chain{Tuple{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, var"#5#7", ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, typeof(Flux.flatten), var"#6#8"}})
    @ Zygote ~/.julia/packages/Zygote/WOy6z/src/compiler/interface.jl:162
 [17] macro expansion
    @ ~/Datos/github/ISL/src/CustomLossFunction.jl:657 [inlined]
 [18] macro expansion
    @ ~/.julia/packages/ProgressMeter/vnCY0/src/ProgressMeter.jl:957 [inlined]
 [19] sliced_invariant_statistical_loss_optimized_2(nn_model::Chain{Tuple{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, var"#5#7", ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, BatchNorm{typeof(relu), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, ConvTranspose{2, 4, typeof(identity), CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, typeof(Flux.flatten), var"#6#8"}}, loader::DataLoader{MLUtils.MappedData{:auto, typeof(gpu), Matrix{Float32}}, Random._GLOBAL_RNG, Val{nothing}}, hparams::HyperParamsSlicedISL)
    @ ISL ~/Datos/github/ISL/src/CustomLossFunction.jl:655
 [20] macro expansion
    @ ~/Datos/github/ISL/examples/Sliced_ISL/MNIST_sliced.jl:166 [inlined]
 [21] top-level scope
    @ ~/.julia/packages/ProgressMeter/vnCY0/src/ProgressMeter.jl:957

That reveals a lot. The core problem is that you’re mixing a CPU and GPU operation:

gemv!(y::Vector{Float32}, tA::Char, A::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::Vector{Float32}, α::Bool, β::Bool)

Notice the CPU output, and a single GPU input. That results in CPU BLAS being used with a GPU buffer, which is unsupported (unless you use unified memory, which currently isn’t the default).

I’m not sure where the CPU arguments are introduced, but maybe this helps you figure out the root of the issue.

2 Likes

Hello again, everyone, and thank you very much for the help. I think I have finally located where I am mixing GPU and CPU arrays. I believe it is in this call:

aₖ = sum([ 
  generate_aₖ(aₖ_slice, ω_data_dot_product) for 
    (aₖ_slice, ω_data_dot_product) in zip(aₖ_slices, ω_data_dot_products)
  ])

To be more precise, the function generate_aₖ is,

"""
    generate_aₖ(ŷ::CuArray{T}, y::T) where {T<:AbstractFloat}

Return the sum of `γ(ŷ, y, k)` over `k = 0, 1, …, length(ŷ)`.
"""
function generate_aₖ(ŷ::CuArray{T}, y::T) where {T<:AbstractFloat}
    bin_indices = 0:length(ŷ)
    # One γ term per bin index, accumulated below.
    contributions = map(k -> γ(ŷ, y, k), bin_indices)
    return CUDA.sum(contributions)
end

and the rest of the functions are,

"""
    γ(yₖ::CuMatrix{T}, yₙ::T, m::Int64) where {T<:AbstractFloat}

Return a device vector of length `size(yₖ, 2) + 1` whose entry for index `m` (counting
from 0) is `ψₘ(ϕ(yₖ, yₙ), m)` and whose other entries are zero.
"""
function γ(yₖ::CuMatrix{T}, yₙ::T, m::Int64) where {T<:AbstractFloat}
    n_cols = size(yₖ, 2)
    # Indicator of position `m` over the indices 0:n_cols, built on the host and uploaded.
    indicator = CuArray([bin == m ? T(1.0) : T(0.0) for bin in 0:n_cols])
    # NOTE(review): `ϕ` reduces with `sum`, so `ψₘ` receives that reduced value here —
    # confirm a matching `ψₘ` method exists.
    weight = ψₘ(ϕ(yₖ, yₙ), m)
    return indicator * weight
end

"""
    ϕ(yₖ::CuMatrix{T}, yₙ::T) where {T<:AbstractFloat}

Return the sum of all entries of `_sigmoid(yₖ, yₙ)`.
"""
function ϕ(yₖ::CuMatrix{T}, yₙ::T) where {T<:AbstractFloat}
    soft_indicators = _sigmoid(yₖ, yₙ)
    return sum(soft_indicators)
end

"""
    ψₘ(y::CuArray{T}, m::Int64) where {T<:AbstractFloat}

Evaluate `exp(-0.5 * ((y - m) / σ)^2)` element-wise over `y`, with `σ = 0.1`.

Returns an array with the same shape as `y`.
"""
function ψₘ(y::CuArray{T}, m::Int64) where {T<:AbstractFloat}
    stddev = T(0.1)
    # `@.` dots every operation, so the whole expression is a single fused broadcast.
    # The partly dotted form materialised array temporaries for `/ stddev` and `-0.5f0 *`.
    return @. exp(-0.5f0 * ((y - m) / stddev)^2)
end

"""
    _sigmoid(ŷ::CuArray{T}, y::T) where {T<:AbstractFloat}

Apply `sigmoid_fast` element-wise to `(y - ŷ) * 10`.
"""
function _sigmoid(ŷ::CuArray{T}, y::T) where {T<:AbstractFloat}
    steepness = 10.0f0
    return @. sigmoid_fast((y - ŷ) * steepness)
end

I think I can’t use an array comprehension with CUDA, as in `CUDA.sum([γ(ŷ, y, k) for k in 0:length(ŷ)])`. Can someone tell me how to rewrite this, or whether this is where the problem is? Thank you very much.

You can pass a map function to sum (which is basically mapreduce(identity, +), in case you need more flexibility).