Not sure where my function is going wrong

Here is my code:

using CUDA, Optimization, OptimizationNLopt

function calc(x0,p)

    a = x0[1:293]        # first 293 entries (tau)
    b = x0[293+1:2*293]  # next 293 entries (q0)
    c = CUDA.zeros(293)  # CUDA arrays for PI and WD
    d = CUDA.zeros(293)

    f = CuArray(fill(Num(0.0), 170 * 293))

    f[1] .= x0[2]  # broadcasted assignment into the first element of f

    f = reshape(f, 293, 170)

    println(1)

    return a, b, c, d, transpose(f)
end

l = 1
p = [l]
x0 = CuArray(rand(2979))
lower_bounds = zeros(2979) .- 100
upper_bounds = zeros(2979) .+ 100
f = OptimizationFunction(calc, AutoModelingToolkit(false, false))
prob = Optimization.OptimizationProblem(f, x0, p, lb = lower_bounds, ub = upper_bounds)
sol = solve(prob, NLopt.LD_LBFGS(), abstol = 1e0)

I get the following error:

1
CuArray only supports element types that are allocated inline.
Any is not allocated inline


Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] check_eltype(T::Type)
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/array.jl:51
  [3] CuArray{Any, 1, CUDA.Mem.DeviceBuffer}(#unused#::UndefInitializer, dims::Tuple{Int64})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/array.jl:66
  [4] (CuArray{Any, 1})(#unused#::UndefInitializer, dims::Tuple{Int64})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/array.jl:147
  [5] (CuArray{Any})(#unused#::UndefInitializer, dims::Tuple{Int64})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/array.jl:166
  [6] similar(#unused#::Type{CuArray{Any}}, dims::Tuple{Int64})
    @ Base ./abstractarray.jl:884
  [7] similar(#unused#::Type{CuArray{Any}}, shape::Tuple{Base.OneTo{Int64}})
    @ Base ./abstractarray.jl:883
  [8] similar(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Symbolics.value), Tuple{CuArray{Num, 1, CUDA.Mem.DeviceBuffer}}}, #unused#::Type{Any})
    @ CUDA ~/.julia/packages/CUDA/YIj5X/src/broadcast.jl:11
  [9] copy
    @ ~/.julia/packages/GPUArrays/dAUOE/src/host/broadcast.jl:42 [inlined]
 [10] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(Symbolics.value), Tuple{CuArray{Num, 1, CUDA.Mem.DeviceBuffer}}})
    @ Base.Broadcast ./broadcast.jl:873
 [11] OptimizationSystem(op::Tuple{CuArray{Num, 1, CUDA.Mem.DeviceBuffer}, CuArray{Num, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, LinearAlgebra.Transpose{Num, CuArray{Num, 2, CUDA.Mem.DeviceBuffer}}}, states::CuArray{Num, 1, CUDA.Mem.DeviceBuffer}, ps::Vector{Num}; observed::Vector{Any}, constraints::Vector{Any}, default_u0::Dict{Any, Any}, default_p::Dict{Any, Any}, defaults::Dict{Any, Any}, name::Symbol, systems::Vector{OptimizationSystem}, checks::Bool, metadata::Nothing, gui_metadata::Nothing)
    @ ModelingToolkit ~/.julia/packages/ModelingToolkit/x6I0O/src/systems/optimization/optimizationsystem.jl:95
 [12] modelingtoolkitize(prob::OptimizationProblem{true, OptimizationFunction{true, AutoModelingToolkit, typeof(calc), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, SymbolicIndexingInterface.SymbolCache{Nothing, Nothing, Nothing}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, Vector{Int64}, Nothing, Nothing, Nothing, Vector{Float64}, Vector{Float64}, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ ModelingToolkit ~/.julia/packages/ModelingToolkit/x6I0O/src/systems/optimization/modelingtoolkitize.jl:57
 [13] modelingtoolkitize(prob::OptimizationProblem{true, OptimizationFunction{true, AutoModelingToolkit, typeof(calc), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, SymbolicIndexingInterface.SymbolCache{Nothing, Nothing, Nothing}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, Vector{Int64}, Nothing, Nothing, Nothing, Vector{Float64}, Vector{Float64}, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}})
    @ ModelingToolkit ~/.julia/packages/ModelingToolkit/x6I0O/src/systems/optimization/modelingtoolkitize.jl:6
 [14] instantiate_function(f::Function, cache::Optimization.ReInitCache{CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, Vector{Int64}}, adtype::AutoModelingToolkit, num_cons::Int64)
    @ OptimizationMTKExt ~/.julia/packages/Optimization/H7hi7/ext/OptimizationMTKExt.jl:58
 [15] OptimizationCache(prob::OptimizationProblem{true, OptimizationFunction{true, AutoModelingToolkit, typeof(calc), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, SymbolicIndexingInterface.SymbolCache{Nothing, Nothing, Nothing}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, Vector{Int64}, Vector{Float64}, Vector{Float64}, Nothing, Nothing, Nothing, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, opt::Algorithm, data::Base.Iterators.Cycle{Tuple{Optimization.NullData}}; callback::Optimization.NullCallback, maxiters::Nothing, maxtime::Nothing, abstol::Float64, reltol::Nothing, progress::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Optimization ~/.julia/packages/Optimization/H7hi7/src/cache.jl:64
 [16] OptimizationCache
    @ ~/.julia/packages/Optimization/H7hi7/src/cache.jl:54 [inlined]
 [17] #__init#36
    @ ~/.julia/packages/Optimization/H7hi7/src/cache.jl:81 [inlined]
 [18] __init (repeats 2 times)
    @ ~/.julia/packages/Optimization/H7hi7/src/cache.jl:72 [inlined]
 [19] #init#604
    @ ~/.julia/packages/SciMLBase/1ISDJ/src/solve.jl:163 [inlined]
 [20] init
    @ ~/.julia/packages/SciMLBase/1ISDJ/src/solve.jl:161 [inlined]
 [21] solve(::OptimizationProblem{true, OptimizationFunction{true, AutoModelingToolkit, typeof(calc), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, SymbolicIndexingInterface.SymbolCache{Nothing, Nothing, Nothing}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, Vector{Int64}, Vector{Float64}, Vector{Float64}, Nothing, Nothing, Nothing, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, ::Algorithm; kwargs::Base.Pairs{Symbol, Float64, Tuple{Symbol}, NamedTuple{(:abstol,), Tuple{Float64}}})
    @ SciMLBase ~/.julia/packages/SciMLBase/1ISDJ/src/solve.jl:94
 [22] top-level scope
    @ In[95]:8

This is odd because it looks like the code ran to completion (it prints out "1"). I'm not sure where the code is going wrong.

I tried looking into it, but honestly I'm still confused.

You can tell from the stacktrace that the relevant CuArray{Any} first shows up at [7] similar. Working backwards:

  • [8] similar takes in Any as an argument
  • [9] copy computed that Any from Broadcast.combine_eltypes(bc.f, bc.args), which does Base._return_type(f, argT). There's a lot going on there, but that basically means some function call's return type is inferred as ::Any (see the short check after this list).
  • copy occurs in [10] materialize of a Broadcasted instance, which happens when you do a dotted call, in this case of Symbolics.value on a CuArray{Num,...}.
  • [11] OptimizationSystem calls value.(scalarize(states)) at line 95. My best guess from Symbolics.jl/src/arrays.jl is that scalarize actually does nothing there, because the input states is a CuArray{Num,...}, not any of the types scalarize changes. As far as I can tell from Symbolics.jl/src/num.jl, the value function just accesses the sole val::Any field of the Num type, so it makes sense that it's inferred as ::Any.
  • [12] modelingtoolkitize provides the ops input to [11] OptimizationSystem by calling your calc in the prob.f(vars, params) line, and the types match. That's where your println(1) runs. The origin of the CuArray{Num,...} instance is the ArrayInterface.restructure calls for vars and params, and that seems to do something like similar(x0, Num), which boils down to a CuArray{Num,...} constructor call.
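
Here's a minimal CPU-only check of that inference claim, assuming only that Symbolics is loaded (the commented results are what I'd expect, not output from your run):

using Symbolics  # provides Num and Symbolics.value

fieldtype(Num, :val)  # Any -- value() just reads this untyped field

# Broadcast picks the output eltype of value.(xs) from the inferred return type:
Base._return_type(Symbolics.value, Tuple{Num})                  # Any
Base.Broadcast.combine_eltypes(Symbolics.value, ([Num(1.0)],))  # Any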

So the weird thing is, my understanding is CuArrays can only contain inline elements, like isbits types or small Unions of them. Accordingly, the CuArray constructor checks Base.allocatedinline. However, while Base.allocatedinline(Any) is false, Base.allocatedinline(Num) is true! I don't know why, nor can I imagine how Num is being stored on the GPU in practice.
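
For what it's worth, this is the quick sketch I used to check that (the last line assumes a working GPU and the CUDA.jl version from your stacktrace):

using Symbolics, CUDA

isbitstype(Num)            # false -- Num carries a reference (its val::Any field)
Base.allocatedinline(Any)  # false
Base.allocatedinline(Num)  # true  -- so CUDA.jl's check_eltype lets it through

CUDA.functional() && CuArray{Num}(undef, 3)  # constructs, despite the reference field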

Huh, okay. Maybe I should open an issue on CUDA.jl on GitHub.

I appreciate you taking the time to look at this. I'm a complete newbie at using CUDA, so any help is appreciated.

You should, but my guess is it'll likely shift to a base Julia issue about allocatedinline, because Num shouldn't be stored inline, to my understanding anyway. After some thought, it may be that allocatedinline is about whether the direct instance is stored inline, and that instance need not be isbits. While an Any array and a Num array would both be implemented as a sequence of pointers, the Any pointers reference element instances entirely outside the sequence, whereas the element Num instances are themselves the pointers to val::Any instances entirely outside the sequence. I still assume that inline non-isbits elements lack the data locality to be GPU-friendly, so maybe the pull request for supporting isbits unions made too permissive a change from isbitstype to allocatedinline?

Besides all that, Num is inherently type-unstable and thus seems unsuitable for the GPU. Maybe there are other optimization libraries that are? Sorry, I don't know this area.

That similar function is supposed to allocate a CuArray with the eltype taken from the second argument, which is explicitly Type{Any} there. So this isn't a CUDA.jl issue.

@Benny and @maleadt, I want to thank you for assisting me with this issue.

Do you think this is an Optimization.jl issue or a Base issue?

Why would this be a Base issue? The Any comes from MTK broadcasting Symbolics.value over a GPU array of Nums, while Base._return_type(Symbolics.value, Tuple{Num}) == Any (this is invoked from Broadcast.combine_eltypes). So this is 'just' an ill-typed broadcast. Admittedly, the way this errors in the GPU stack isn't nice, but it's hard to do better (we can't easily postpone the error to kernel execution time, where we'd be able to report on the ill-typed code being executed, because we can't construct a CuArray{Any} in the first place).
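
You can reproduce the same failure mode without MTK by broadcasting any function whose return type infers as Any over a CuArray. A minimal sketch (the untyped helper is a hypothetical stand-in, and the exact error text depends on your GPUArrays/CUDA.jl versions):

using CUDA

# Reading from a Ref{Any} makes inference give up, so the broadcast's
# combined eltype becomes Any, just like Symbolics.value on Num.
untyped(x) = Ref{Any}(x)[]

untyped.(CUDA.zeros(4))  # on the versions in this thread, hits the same
                         # "only supports element types that are allocated inline" error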

Are inline elements with references, like Num, intended to be allowed in CuArrays, then? For an example without type-stability issues: String is not allocatedinline, but struct Str str::String end is. It seems like operations on Str("blah").str should be just as problematic as those on "blah"; I can't see how the references would be handled any better just because the element type is wrangled to contain them.
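
To make that concrete, this is the comparison I have in mind (Str is just a throwaway wrapper type for illustration):

struct Str
    str::String
end

Base.allocatedinline(String)  # false -- a bare String element is a heap reference
Base.allocatedinline(Str)     # true  -- the wrapper is stored inline, yet still holds a reference
isbitstype(Str)               # false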

They are not. I guess we need to tighten the allocatedinline check; we generalized it from isbits in order to cover the union-isbits case (which is supported), but it looks like it covers too much now.
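
To spell out what the current check accepts versus what it was meant to accept, a quick illustration (Num here stands in for any inline struct carrying a reference):

using Symbolics  # for Num

isbitstype(Union{Int,Missing})            # false
Base.allocatedinline(Union{Int,Missing})  # true  -- the isbits-union case we wanted to allow

isbitstype(Num)                           # false
Base.allocatedinline(Num)                 # true  -- but this slips through the same check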