SimpleChains method error with custom loss function and train_unbatched

Hi SimpleChains.jl people!

I am trying to build a contrastive classifier on low-dimensional data. I built a custom loss function, following the example in the SimpleChains docs. valgrad! doesn’t error (EDIT: but it also doesn’t touch the gradients I feed it), and I hit a MethodError in train_unbatched:

ERROR: MethodError: no method matching StrideArraysCore.PtrArray(::Nothing)
Closest candidates are:
  StrideArraysCore.PtrArray(::Union{StrideArray, StrideArraysCore.StrideBitArray}) at ~/.julia/packages/StrideArraysCore/VQxXL/src/stridearray.jl:189
  StrideArraysCore.PtrArray(::Ptr{T}, ::Tuple{Vararg{Union{Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}, N}}) where {T, N} at ~/.julia/packages/StrideArraysCore/VQxXL/src/ptr_array.jl:178
  StrideArraysCore.PtrArray(::Ptr, ::Tuple{Vararg{Union{Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8, Static.StaticInt}}}, ::Static.StaticInt{1}) at ~/.julia/packages/StrideArraysCore/VQxXL/src/ptr_array.jl:184
  ...
Stacktrace:
 [1] update!(g::StrideArray{Tuple{Static.StaticInt{707}, Static.StaticInt{4}}, (true, false), Float32, 2, 1, 0, (1, 2), Tuple{Static.StaticInt{4}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}, Vector{Float32}}, opt::SimpleChains.ADAM, Xp::StrideArraysCore.PtrArray{Tuple{Static.StaticInt{4}, Int64}, (true, true), Float64, 2, 1, 0, (1, 2), Tuple{Static.StaticInt{8}, Static.StaticInt{32}}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}, layers::Tuple{TurboDense{true, Static.StaticInt{32}, typeof(tanh)}, TurboDense{true, Static.StaticInt{16}, typeof(tanh)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, ContrastiveCrossEntropyLoss{SampleContrast, Vector{SampleContrast}}}, pen::NoPenalty{SimpleChain{Tuple{Static.StaticInt{4}}, Tuple{TurboDense{true, Static.StaticInt{32}, typeof(tanh)}, TurboDense{true, Static.StaticInt{16}, typeof(tanh)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, ContrastiveCrossEntropyLoss{SampleContrast, Vector{SampleContrast}}}}}, sx::Tuple{Static.StaticInt{4}, Int64}, p::StrideArraysCore.StaticStrideArray{Tuple{Static.StaticInt{707}}, (true,), Float32, 1, 1, 0, (1,), Tuple{Static.StaticInt{4}}, Tuple{Static.StaticInt{1}}, 707}, pm::Ptr{UInt8}, optbuffer::Tuple{StrideArraysCore.PtrArray{Tuple{Static.StaticInt{707}}, (true,), Float32, 1, 1, 0, (1,), Tuple{Static.StaticInt{4}}, Tuple{Static.StaticInt{1}}}, StrideArraysCore.PtrArray{Tuple{Static.StaticInt{707}}, (true,), Float32, 1, 1, 0, (1,), Tuple{Static.StaticInt{4}}, Tuple{Static.StaticInt{1}}}, StrideArraysCore.PtrArray{Tuple{Static.StaticInt{2}}, (true,), Float64, 1, 1, 0, (1,), Tuple{Static.StaticInt{8}}, Tuple{Static.StaticInt{1}}}}, mpt::Int64)
   @ SimpleChains ~/.julia/packages/SimpleChains/fifFm/src/optimize.jl:124
 [2] train_unbatched_core!(c::SimpleChain{Tuple{Static.StaticInt{4}}, Tuple{TurboDense{true, Static.StaticInt{32}, typeof(tanh)}, TurboDense{true, Static.StaticInt{16}, typeof(tanh)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, ContrastiveCrossEntropyLoss{SampleContrast, Vector{SampleContrast}}}}, pu::Ptr{UInt8}, g::StrideArray{Tuple{Static.StaticInt{707}, Static.StaticInt{4}}, (true, false), Float32, 2, 1, 0, (1, 2), Tuple{Static.StaticInt{4}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}, Vector{Float32}}, pX::StrideArraysCore.PtrArray{Tuple{Static.StaticInt{4}, Int64}, (true, true), Float64, 2, 1, 0, (1, 2), Tuple{Static.StaticInt{8}, Static.StaticInt{32}}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}, p::StrideArraysCore.StaticStrideArray{Tuple{Static.StaticInt{707}}, (true,), Float32, 1, 1, 0, (1,), Tuple{Static.StaticInt{4}}, Tuple{Static.StaticInt{1}}, 707}, opt::SimpleChains.ADAM, iters::Int64, mpt::Int64)
   @ SimpleChains ~/.julia/packages/SimpleChains/fifFm/src/optimize.jl:342
 [3] with_heap_memory
   @ ~/.julia/packages/SimpleChains/fifFm/src/memory.jl:36 [inlined]
 [4] with_memory
   @ ~/.julia/packages/SimpleChains/fifFm/src/memory.jl:47 [inlined]
 [5] train_unbatched!(g::StrideArray{Tuple{Static.StaticInt{707}, Static.StaticInt{4}}, (true, false), Float32, 2, 1, 0, (1, 2), Tuple{Static.StaticInt{4}, Int64}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}, Vector{Float32}}, p::StrideArraysCore.StaticStrideArray{Tuple{Static.StaticInt{707}}, (true,), Float32, 1, 1, 0, (1,), Tuple{Static.StaticInt{4}}, Tuple{Static.StaticInt{1}}, 707}, _chn::SimpleChain{Tuple{Static.StaticInt{4}}, Tuple{TurboDense{true, Static.StaticInt{32}, typeof(tanh)}, TurboDense{true, Static.StaticInt{16}, typeof(tanh)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}, ContrastiveCrossEntropyLoss{SampleContrast, Vector{SampleContrast}}}}, X::Matrix{Float64}, opt::SimpleChains.ADAM, t::Int64)
   @ SimpleChains ~/.julia/packages/SimpleChains/fifFm/src/optimize.jl:399

I’m new to machine learning and SIMD, so this is beyond my pay grade.

EDIT: Note that my targets y are half the length of the number of outputs because of the contrast loss I’m trying out: one target for each pair of inputs. Maybe I have to let SimpleChains know about this, given the code in src\loss for the regular cross-entropy loss:

function layer_output_size(::Val{T}, sl::LogitCrossEntropyLoss, s::Tuple) where {T}
  _layer_output_size_needs_temp_of_equal_len_as_target(Val{T}(), sl, s)
end
function forward_layer_output_size(::Val{T}, sl::LogitCrossEntropyLoss, s) where {T}
  _layer_output_size_needs_temp_of_equal_len_as_target(Val{T}(), sl, s)
end

I don’t really know what the loss layer is doing or what needs to be defined.

In case code context is needed, here is my custom loss. I’m trying to use this for reinforcement learning; it’s mostly copied and pasted from the docs.


mutable struct ChainActor{C,P,G,F}
    β::Float64 # β is a component of the actor.
    chain::C
    params::P
    grads::G
    prefun::F
end

function train!(ca::ChainActor, memory; grad_steps = length(memory))
    (x,y) = make_xy(memory; prefun = ca.prefun)
    chain_loss = SimpleChains.add_loss(ca.chain, ContrastiveCrossEntropyLoss(y))
    SimpleChains.valgrad!(ca.grads, chain_loss, x, ca.params)
    #SimpleChains.train_unbatched!(ca.grads, ca.params, chain_loss, x, SimpleChains.ADAM(), grad_steps)
    # Switching to train_unbatched! here, even with a single grad_step, gives the MethodError above.
end

function (a::ChainActor)(state, action)
    x = a.prefun(vcat(state, action, a.β)) # remove time in pendulum example
    return a.chain(x, a.params)[1]
end

function init_chain_actor(arch...; prefun = identity, ndims = 1)
    chain = SimpleChain(static(ndims), arch..., TurboDense(identity, 1)) # must terminate with a single dimension
    params = SimpleChains.init_params(chain)
    grads = SimpleChains.alloc_threaded_grad(chain)
    ChainActor(1.0, chain, params, grads, prefun)
end
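For context, a call that would reproduce the sizes in the stack trace above (4 inputs: state, action, and β; hidden widths 32 and 16, i.e. 707 parameters) would look something like:

# Hypothetical usage; arguments chosen only to reproduce the sizes in the stack trace.
actor = init_chain_actor(TurboDense(tanh, 32), TurboDense(tanh, 16), TurboDense(identity, 1); ndims = 4)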




struct ContrastiveCrossEntropyLoss{T,Y<:AbstractVector{T}} <: SimpleChains.AbstractLoss{T}
    targets::Y
end

function calculate_loss(loss::ContrastiveCrossEntropyLoss, logits)
    # logits is an even number of outputs for the neural net
    y = loss.targets
    total_loss = zero(Float64)
    for ii in eachindex(y)
        Δε = logits[2*ii - 1] - logits[2*ii]
        total_loss += contrast_loss(y[ii],Δε)
    end
    total_loss
end

function SimpleChains.layer_output_size(::Val{T}, sl::ContrastiveCrossEntropyLoss, s::Tuple) where {T}
    SimpleChains._layer_output_size_no_temp(Val{T}(), sl, s)
end

function SimpleChains.forward_layer_output_size(::Val{T}, sl::ContrastiveCrossEntropyLoss, s) where {T}
    SimpleChains._layer_output_size_no_temp(Val{T}(), sl, s)
end


function (loss::ContrastiveCrossEntropyLoss)(previous_layer_output::AbstractArray, p::Ptr, pu)
    total_loss = calculate_loss(loss, previous_layer_output)
    total_loss, p, pu
end



function SimpleChains.chain_valgrad!(
    __,
    previous_layer_output::AbstractArray{T},
    layers::Tuple{ContrastiveCrossEntropyLoss},
    _::Ptr,
    pu::Ptr{UInt8},
) where {T}
    loss = getfield(layers, 1)
    total_loss = calculate_loss(loss, previous_layer_output)
    y = loss.targets
    # Store the backpropagated gradient in the previous_layer_output array.
    for i in eachindex(y)
        # Get the pair of logits corresponding to target i
        e1 = previous_layer_output[2*i-1]
        e2 = previous_layer_output[2*i]
        sign_arg = Float32(contrast_grad(y[i], e1-e2))
        if isnan(sign_arg)
            println("badloss")
        end
        previous_layer_output[2i-1] = sign_arg
        previous_layer_output[2i] = - sign_arg
    end
    return total_loss, previous_layer_output, pu
end
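For reference, contrast_loss and contrast_grad aren’t shown above; minimal stand-ins that make the snippet self-contained (a plain logistic contrast on the energy difference, not necessarily what I actually use) would be:

# Hypothetical stand-ins for the helpers called above, just to make the snippet runnable.
# A logistic (Bradley–Terry style) contrast: y ∈ {0, 1} marks which element of the pair is preferred.
contrast_loss(y, Δε) = -(y * log(1 / (1 + exp(-Δε))) + (1 - y) * log(1 - 1 / (1 + exp(-Δε))))
contrast_grad(y, Δε) = 1 / (1 + exp(-Δε)) - y   # derivative of contrast_loss with respect to Δε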

Thanks for the help!


Hi,

I think I wrote some of the docs you followed for this, so I apologise for them causing some errors! I only ever used valgrad! for the gradients and never got as far as using the built-in training methods. I didn’t write any of the code for SimpleChains.jl itself, so it’s likely I overlooked something important in the docs.

I don’t know exactly what the issue is, but I will have a look when I get some time and update the docs when it gets fixed so this won’t be an issue in the future.


I really appreciate it; it was because of that doc page that I chose to use SimpleChains. My use case is small data, realtime training, and a custom loss function, so this will be the perfect package if I can get it working :grinning:

I seem to get training working when I go from allocating a threaded gradient buffer

grads = SimpleChains.alloc_threaded_grad(chain)

to a single-threaded version:

grads = similar(chain)

I don’t know why, but I’m quite excited! Single-threaded is fine for me.

@jmair Good luck with the docs, and let me know if you figure out how to get multithreading working, or if you want data to test this unusual cost function with.

I think I’ve found the source of the error: you need to define the target function:

target(loss::ContrastiveCrossEntropyLoss) = loss.targets

By default this returns nothing, which is why optimize.jl:124 is not finding the correct method. I will add this to the docs. I don’t have the full code to run to check whether this fixes it, but it needs adding to the docs anyway.
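I suspect it also needs to extend the function from the package, rather than define a new target function in your own module, i.e.:

# Extend SimpleChains.target so the training code actually dispatches to it:
SimpleChains.target(loss::ContrastiveCrossEntropyLoss) = loss.targets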

EDIT:

You might also need a line, which I am guessing is:

(loss::ContrastiveCrossEntropyLoss)(::Int) = loss

I think the input is the current epoch, which is helpful if you want to change the loss over time. With these additions the example in the docs runs with train_unbatched, but I haven’t checked whether the gradient calculation is actually correct yet. If this works for you, let me know here; for now I will submit a PR to update the docs: Fix custom loss layer documentation by JamieMair · Pull Request #120 · PumasAI/SimpleChains.jl · GitHub


The docs should now be updated with the extra functions you need: Adding a custom loss layer · SimpleChains.jl


Hi, I am now following the new docs with Julia 1.9 and I am getting the error again. If I switch gradients = SimpleChains.alloc_threaded_grad(model) to gradients = similar(model), I get the error:

MethodError: no method matching pointer(::SimpleChain{Tuple{Static.StaticInt{2}}, Tuple{TurboDense{true, Static.StaticInt{32}, typeof(tanh)}, TurboDense{true, Static.StaticInt{16}, typeof(tanh)}, TurboDense{true, Static.StaticInt{1}, typeof(identity)}}})

The Code:

using SimpleChains
struct BinaryLogitCrossEntropyLoss{T,Y<:AbstractVector{T}} <: SimpleChains.AbstractLoss{T}
  targets::Y
end
target(loss::BinaryLogitCrossEntropyLoss) = loss.targets
(loss::BinaryLogitCrossEntropyLoss)(::Int) = loss

function calculate_loss(loss::BinaryLogitCrossEntropyLoss, logits)
  y = loss.targets
  total_loss = zero(eltype(logits))
  for i in eachindex(y)
      p_i = inv(1 + exp(-logits[i]))
      y_i = y[i]
      total_loss -= y_i * log(p_i) + (1 - y_i) * log(1 - p_i)
  end
  total_loss
end
function (loss::BinaryLogitCrossEntropyLoss)(previous_layer_output::AbstractArray{T}, p::Ptr, pu) where {T}
  total_loss = calculate_loss(loss, previous_layer_output)
  total_loss, p, pu
end

function SimpleChains.layer_output_size(::Val{T}, sl::BinaryLogitCrossEntropyLoss, s::Tuple) where {T}
  SimpleChains._layer_output_size_no_temp(Val{T}(), sl, s)
end
function SimpleChains.forward_layer_output_size(::Val{T}, sl::BinaryLogitCrossEntropyLoss, s) where {T}
  SimpleChains._layer_output_size_no_temp(Val{T}(), sl, s)
end

function SimpleChains.chain_valgrad!(
  __,
  previous_layer_output::AbstractArray{T},
  layers::Tuple{BinaryLogitCrossEntropyLoss},
  _::Ptr,
  pu::Ptr{UInt8},
) where {T}
  loss = getfield(layers, 1)
  total_loss = calculate_loss(loss, previous_layer_output)
  y = loss.targets

  # Store the backpropagated gradient in the previous_layer_output array.
  for i in eachindex(y)
      sign_arg = 2 * y[i] - 1
      # Get the value of the i-th logit
      logit_i = previous_layer_output[i]
      previous_layer_output[i] = -(sign_arg * inv(1 + exp(sign_arg * logit_i)))
  end

  return total_loss, previous_layer_output, pu
end



model = SimpleChain(
  static(2),
  TurboDense(tanh, 32),
  TurboDense(tanh, 16),
  TurboDense(identity, 1)
)

batch_size = 64
X = rand(Float32, 2, batch_size)
Y = rand(Bool, batch_size)

parameters = SimpleChains.init_params(model);
#gradients = SimpleChains.alloc_threaded_grad(model);
gradients = similar(model)

# Add the loss like any other loss type
model_loss = SimpleChains.add_loss(model, BinaryLogitCrossEntropyLoss(Y));


SimpleChains.valgrad!(gradients, model_loss, X, parameters)

epochs = 10
SimpleChains.train_unbatched!(gradients, parameters, model_loss, X, SimpleChains.ADAM(), 1:epochs); 

A PR to the documentation would be very welcome!
You’re missing two methods:

SimpleChains.target(l::BinaryLogitCrossEntropyLoss) = getfield(l,:targets)
(l::BinaryLogitCrossEntropyLoss)(t::AbstractArray) = BinaryLogitCrossEntropyLoss(t)

With that, I get

julia> @time SimpleChains.train_unbatched!(gradients, parameters, model_loss, X, SimpleChains.ADAM(), 1:epochs);
  0.737761 seconds (1.30 M allocations: 85.167 MiB, 122.57% compilation time: 80% of which was recompilation)

julia> @time SimpleChains.train_unbatched!(gradients, parameters, model_loss, X, SimpleChains.ADAM(), 1:epochs);
  0.000253 seconds (1 allocation: 32 bytes)

julia> @time SimpleChains.train_unbatched!(gradients, parameters, model_loss, X, SimpleChains.ADAM(), 1:epochs);
  0.000240 seconds (1 allocation: 32 bytes)

julia> @time SimpleChains.train_unbatched!(gradients, parameters, model_loss, X, SimpleChains.ADAM(), 1:epochs);
  0.000212 seconds (1 allocation: 32 bytes)