Setfield! allocates memory when setting Float64 values

The task is to copy all elements of a vector (x::Vector{Float64}) into a hierarchical data structure many times. The mapping is fixed at the beginning: for example, x[3] is always copied to a.b.c.d. Intuitively, it should be possible to perform such a copy operation without allocating memory (allocation is slow when performed many times). I hoped that this task could be implemented with the setfield!(..) function. However, I did not manage to do so: either I am doing something wrong, or setfield!(..) is the wrong function. Below is my test case:

module Test

# Flat record of five scalar values; every field has the same concrete type.
# Declared mutable so that individual fields can be overwritten in place.
mutable struct Vars
    v1::Float64
    v2::Float64
    v3::Float64
    v4::Float64
    v5::Float64
end

# Table describing where each entry of a flat vector `x` is stored:
# entry `i` belongs to field number `x_index[i]` of the object `x_comp[i]`.
# The constructor fills the table with 100 independent `Vars` objects of five
# fields each, i.e. 500 entries in total.
mutable struct CopyTable
    x_comp::Vector{Any}       # x_comp[i] is the component in which x[i] is present
    x_index::Vector{Int}      # x_index[i] is the index of x[i] within x_comp[i]

    function CopyTable()
        template = Vars(11.0, 12.0, 13.0, 14.0, 15.0)
        owners   = Any[]
        slots    = Int[]
        for _ in 1:100
            component = deepcopy(template)   # every group of five entries gets its own object
            for slot in 1:5
                push!(owners, component)
                push!(slots, slot)
            end
        end
        return new(owners, slots)
    end
end

# Copy every entry of `x` into the field recorded for it in `ct`, and repeat the
# whole pass 1000 times so that the per-assignment cost dominates the timing.
function copy_from_x(ct::CopyTable, x::Vector{Float64})
    for _ in 1:1000, i in eachindex(ct.x_index)
        setfield!(ct.x_comp[i], ct.x_index[i], x[i])   # Critical statement
    end
end

ct = CopyTable()
x  = randn(length(ct.x_index))   # one random value per table entry

      copy_from_x(ct, x)   # untimed first call, so that compilation is not part of the timing below
@time copy_from_x(ct, x)

end

Executing this module in Julia 1.3.0, gives the following print-out:

  0.014597 seconds (500.00 k allocations: 7.630 MiB)

Therefore about 500 000 objects are allocated with a total of 7.6 Mbyte of memory. I would expect that if setfield! should copy a Float64 value into a Float64 field of a mutable struct, that this can be performed without allocating memory.

First of all, I noticed that your CopyTable has a Vector{Any} field, which will tend to cause poor performance and additional allocations in general. Replacing it with a Vector{Vars} does not solve the immediate problem, but it’s still a good idea.

Back to the central issue: We can actually demonstrate it with a much simpler example:

julia> mutable struct Vars  # same five-Float64 layout as in the question
           v1::Float64
           v2::Float64
           v3::Float64
           v4::Float64
           v5::Float64
       end

julia> v = Vars(1, 2, 3, 4, 5);

julia> using BenchmarkTools

julia> @btime setfield!(v, i, x) setup=(i = 1; x = 2.0)  # field index and value are supplied through `setup`
  18.975 ns (1 allocation: 16 bytes)

Exactly one allocation per setfield! call, which is consistent with your results above.

It seems like the compiler just isn’t quite clever enough to optimize this all the way. In general this is a hard problem: given a call like setfield!(::Foo, ::Int, ::Float64), the types of the inputs don’t generally provide enough information because the type of the field being set might depend on the value of the field index. It happens that for Vars that’s not the case because every field is a Float64, but it’s not entirely surprising that the compiler doesn’t have some special logic to figure that out here.

If your data were truly just a single struct, and you didn’t need it to be mutable, then you could use reinterpret:

julia> struct ImmutableVars  # immutable, two Float64 fields
         v1::Float64
         v2::Float64
       end

julia> reinterpret(ImmutableVars, [1.0, 2.0])  # the two Float64 entries are shown as one ImmutableVars element
1-element reinterpret(ImmutableVars, ::Array{Float64,1}):
 ImmutableVars(1.0, 2.0)

otherwise you might actually want to descend into @generated function territory. We can generate a function for an input x::T which generates code that looks like:

# Shape of the code to be generated: one call per field, each naming its field by a literal Symbol.
setproperty!(x, :field1, values[1])
setproperty!(x, :field2, values[2])
setproperty!(x, :field3, values[3])

by iterating over the field names of our type:

julia> @generated function set_fields(x::T, values::AbstractVector) where {T}
         expressions = []   # collects one assignment expression per field of T
         for (i, field) in enumerate(fieldnames(T))
           push!(expressions, quote
             setproperty!(x, $(QuoteNode(field)), values[$i])   # field name and position are spliced in as literals
           end)
         end
         return quote 
           begin
             @assert length(values) == $(length(fieldnames(T)))   # the field count is spliced in as a constant
             $(expressions...)
             x   # value of the generated block: the object that was passed in
           end
         end
       end
set_fields (generic function with 1 method)
julia> v = Vars(1, 2, 3, 4, 5);

julia> set_fields(v, [5, 4, 3, 2, 1])
Vars(5.0, 4.0, 3.0, 2.0, 1.0)

julia> v
Vars(5.0, 4.0, 3.0, 2.0, 1.0)

Since all of the field accesses are now by their literal names, the compiler has no trouble optimizing the resulting expression:

julia> @btime set_fields($v, $([5, 4, 3, 2, 1]))  # `$` splices both arguments into the benchmark expression
  4.470 ns (0 allocations: 0 bytes)
Vars(5.0, 4.0, 3.0, 2.0, 1.0)

If you want to see more about how the @generated function works, we can split up the actual @generated function from the part that builds the expression:

julia> function make_set_fields_expression(T)  # ordinary function: takes a type and returns an expression
         expressions = []   # collects one assignment expression per field of T
         for (i, field) in enumerate(fieldnames(T))
           push!(expressions, quote
             setproperty!(x, $(QuoteNode(field)), values[$i])   # `x` and `values` are left as plain names in the expression
           end)
         end
         return quote 
           begin
             @assert length(values) == $(length(fieldnames(T)))   # the field count is spliced in as a constant
             $(expressions...)
             x
           end
         end
       end
make_set_fields_expression (generic function with 1 method)

julia> @generated function set_fields(x::T, values::AbstractVector) where {T}
         make_set_fields_expression(T)   # the expression returned here becomes the method body for this T
       end
set_fields (generic function with 1 method)

Here’s the expression that is built. Note how each field in Vars has been stuck literally into the resulting expression:

julia> make_set_fields_expression(Vars)
quote
    #= REPL[42]:9 =#
    begin
        #= REPL[42]:10 =#
        #= REPL[42]:10 =# @assert length(values) == 5
        #= REPL[42]:11 =#
        begin
            #= REPL[42]:5 =#
            setproperty!(x, :v1, values[1])
        end
        begin
            #= REPL[42]:5 =#
            setproperty!(x, :v2, values[2])
        end
        begin
            #= REPL[42]:5 =#
            setproperty!(x, :v3, values[3])
        end
        begin
            #= REPL[42]:5 =#
            setproperty!(x, :v4, values[4])
        end
        begin
            #= REPL[42]:5 =#
            setproperty!(x, :v5, values[5])
        end
        #= REPL[42]:12 =#
        x
    end
end
8 Likes

Thanks very much for your reply.

First of all, I noticed that your CopyTable has a Vector{Any} field, which will tend to cause poor performance and additional allocations in general. Replacing it with a Vector{Vars} does not solve the immediate problem, but it’s still a good idea.

Unfortunately, this is not possible: In reality x_comp::Vector{Any} contains references to many types (see extended test example below).

Exactly one allocation per setfield! call, which is consistent with your results above.
It seems like the compiler just isn’t quite clever enough to optimize this all the way. In general this is a hard problem: …

So it is probably unrealistic to hope that at some time in the future setfield! is specifically improved for basic value types, such as Float64, Int, Bool if the type information is only available at run-time.

If your data were truly just a single struct, and you didn’t need it to be mutable, then you could use reinterpret …

Hm. I need to evaluate this, but it is probably not possible for our application.

… otherwise you might actually want to descend into @generated function territory. …

Your solution proposal is a good idea. Since the data structure can consist of various types, your proposal cannot be directly applied. I tried to extend your solution proposal so that it can work with a vector of different types:

module Test2

# First of two component types used in the test; five Float64 fields named v1..v5.
mutable struct Vars1
    v1::Float64
    v2::Float64
    v3::Float64
    v4::Float64
    v5::Float64
end

# Second component type; same layout as Vars1 but with fields named r1..r5.
mutable struct Vars2
    r1::Float64
    r2::Float64
    r3::Float64
    r4::Float64
    r5::Float64
end

# Build the test data: a Vector{Any} of length 2*nc that alternates between
# independent Vars1 and Vars2 objects, starting with a Vars1.
function buildVars(nc::Int)
    proto1 = Vars1(1.0, 2.0, 3.0, 4.0, 5.0)
    proto2 = Vars2(6.0, 7.0, 8.0, 9.0, 10.0)
    components = Any[]
    for _ in 1:nc
        # Each pair is a fresh copy, so no two entries share an object.
        append!(components, (deepcopy(proto1), deepcopy(proto2)))
    end
    return components
end

# Position of the i-th field (1-based) of a component whose first field is at position j.
getIndex(i, j) = i + j - 1

# Generated once per concrete component type T: the method body consists of one
# `setproperty!` call per field of T, each with a literal field name, reading
# from `x` at the position given by `getIndex(field_number, j)`.
# A plain loop is used to assemble the body (no closures or comprehensions).
@generated function set_fields!(comp::T, x::Vector{Float64}, j::Int)::Nothing where {T}
    body = Expr(:block)
    for (offset, name) in enumerate(fieldnames(T))
        push!(body.args, :(setproperty!(comp, $(QuoteNode(name)), x[getIndex($offset, j)])))
    end
    push!(body.args, :(return nothing))
    return body
end

# Copy consecutive groups of five values from `x` into the components of `vars`:
# component k receives x[5k-4] .. x[5k].
function copy_from_x!(vars::Vector{Any}, x::Vector{Float64})::Nothing
    for (k, comp) in enumerate(vars)
        # `vars` is a Vector{Any}, so the method of set_fields! is selected at run time.
        # NOTE(review): presumably the Int start index is boxed for this dynamic call;
        # verify whether that accounts for the allocations reported for larger nc.
        set_fields!(comp, x, 5k - 4)
    end
    return nothing
end


# Benchmarks for increasing problem sizes. Each size is run once untimed before
# the timed call, so the measurement reflects a repeated call.
nc=2; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
      copy_from_x!(vars, x)
@time copy_from_x!(vars, x)


nc=10; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
      copy_from_x!(vars, x)
@time copy_from_x!(vars, x)


nc=100; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
      copy_from_x!(vars, x)
@time copy_from_x!(vars, x)

nc=10000; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
      copy_from_x!(vars, x)
@time copy_from_x!(vars, x)

end   # closes `module Test2`

This module produces the following output:

nc = 2
  0.000003 seconds (4 allocations: 160 bytes)

nc = 10
  0.000003 seconds (4 allocations: 160 bytes)

nc = 100
  0.000009 seconds (101 allocations: 1.672 KiB)

nc = 10000
  0.000508 seconds (19.90 k allocations: 311.047 KiB)

This means that, in principle, the solution seems to work (which is good!), but when the data structure becomes larger (nc=100 and nc=10000), memory is suddenly allocated.

Any suggestions on how this memory allocation can be avoided (note that the copy_from_x! function needs to be called several thousand times)?

Not-nice-solution with pointers

I have one solution (which I do not like), by working with pointers. Hereby Vars1 is changed to Vars1_ptr:

# Mutable wrapper around a single Float64, so that a reference to one value can be stored and updated.
mutable struct Variable
    value::Float64
end

# Variant of Vars1 in which every field holds a `Variable` wrapper instead of a bare Float64.
mutable struct Vars1_ptr
    v1::Variable
    v2::Variable
    v3::Variable
    v4::Variable
    v5::Variable

    # Takes the five plain values and wraps each one in its own `Variable`.
    function Vars1_ptr(v1, v2, v3, v4, v5)
        return new(Variable(v1), Variable(v2), Variable(v3), Variable(v4), Variable(v5))
    end
end

vars1 = Vars1_ptr(1.0, 2.0, 3.0, 4.0, 5.0)
pointer_to_v1 = vars1.v1              # the mutable Variable object stored in field v1
value_of_v1   = pointer_to_v1.value   # read the current value through that reference

It is now possible to have a pointer to every field of Vars1_ptr and then build up a table that contains the copy information, for example that x[3] should be copied to pointer_to_v1. This solution does not allocate memory during the copy operation and is suited for large data structures. The severe disadvantage is that all operations on Vars1_ptr must be implemented with an additional, unnecessary level of hierarchy (e.g. use vars1.v1.value instead of vars1.v1).

1 Like

A variant of the @generated function approach could be to generate “somehow” a “set_field!” function explicitly with a macro once for every type involved and then call these functions. Below is a test (replacing the macro by manually providing the set_fields! functions for simplicity):

module Test3

# First component type of the test: five Float64 fields v1..v5.
mutable struct Vars1
    v1::Float64
    v2::Float64
    v3::Float64
    v4::Float64
    v5::Float64
end

# Hand-written copy routine for Vars1: stores x[j] .. x[j+4] in v1 .. v5, in that order.
# Returns the value stored in the last field.
function set_fields!(v::Vars1, x::Vector{Float64}, j::Int)
    v.v1 = x[j]
    v.v2 = x[j + 1]
    v.v3 = x[j + 2]
    v.v4 = x[j + 3]
    last_value = x[j + 4]
    v.v5 = last_value
    return last_value
end

# Second component type of the test: five Float64 fields r1..r5.
mutable struct Vars2
    r1::Float64
    r2::Float64
    r3::Float64
    r4::Float64
    r5::Float64
end

# Hand-written copy routine for Vars2: stores x[j] .. x[j+4] in r1 .. r5, in that order.
# Returns the value stored in the last field.
function set_fields!(v::Vars2, x::Vector{Float64}, j::Int)
    v.r1 = x[j]
    v.r2 = x[j + 1]
    v.r3 = x[j + 2]
    v.r4 = x[j + 3]
    last_value = x[j + 4]
    v.r5 = last_value
    return last_value
end

# Walk through `vars` and hand each component its start position in `x`
# (1, 6, 11, ...); the matching set_fields! method does the actual copying.
function copy_from_x!(vars::Vector{Any}, x::Vector{Float64})::Nothing
    for (comp, start) in zip(vars, Iterators.countfrom(1, 5))
        set_fields!(comp, x, start)
    end
    return nothing
end

# Build the test data: nc pairs of (Vars1, Vars2) objects stored in one Vector{Any}.
function buildVars(nc::Int)
    first_kind  = Vars1(1.0, 2.0, 3.0, 4.0, 5.0)
    second_kind = Vars2(6.0, 7.0, 8.0, 9.0, 10.0)
    vars = Any[]
    for _ in 1:nc
        # Copies keep the entries independent of each other.
        push!(vars, deepcopy(first_kind), deepcopy(second_kind))
    end
    return vars
end

# Benchmarks for increasing problem sizes.
# Only the first size has an untimed call before the timed one; the later sizes are timed directly.
nc=2; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
      copy_from_x!(vars, x)
@time copy_from_x!(vars, x)


nc=10; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
@time copy_from_x!(vars, x)


nc=100; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
@time copy_from_x!(vars, x)


nc=10000; vars = buildVars(nc);  x = randn(nc*10)
println("\nnc = $nc")
@time copy_from_x!(vars, x)

end

The output is:

nc = 2
  0.000003 seconds (4 allocations: 160 bytes)

nc = 10
  0.000003 seconds (4 allocations: 160 bytes)

nc = 100
  0.000005 seconds (4 allocations: 160 bytes)

nc = 10000
  0.000184 seconds (4 allocations: 160 bytes)

So this looks good with respect to memory (no unnecessary memory allocated).

However, multiple dispatch is used in a way so that functions set_fields! are resolved at runtime and it seems not possible to avoid this. The question is whether there will be issues if many different types are used in vector vars. In this discourse discussion a similar topic seems to be discussed and when I read this correctly, there can be in fact a substantial overhead of multiple dispatch in such a case.

I found now a reasonable solution where all the type-information is available at compile-time and no function calls are used in the copy-operation. The only minor drawback is that the copy-operation (structured according to type) has to be generated on-the-fly with eval (for simplicity and clarity, the result of the eval operation is shown below and not the code that produces the expression for eval):

module Test4

# Component type with five Float64 fields; each object also records where its
# own values start in the flat vector x.
mutable struct Vars1
    v1::Float64
    v2::Float64
    v3::Float64
    v4::Float64
    v5::Float64
    startIndex::Int64       # Start-index in x-vector
end


# Second component type (fields r1..r5); like Vars1 it records where its own
# values start in the flat vector x.
mutable struct Vars2
    r1::Float64
    r2::Float64
    r3::Float64
    r4::Float64
    r5::Float64
    startIndex::Int64       # Start-index in x-vector
end


# Generate with "eval"
# Holds one concretely typed vector per component type.
mutable struct Components
    field_Vars1::Vector{Vars1}
    field_Vars2::Vector{Vars2}
    Components() = new()   # leaves both fields unassigned; buildVars! assigns them
end
components = Components()   # single instance that buildVars! fills before each measurement

# Copy values from `x` into every stored component. Each object supplies its own
# start position through `startIndex`, and each loop runs over a vector with a
# single concrete element type.
function copy_from_x!(x::Vector{Float64}, components::Components)
    for obj in components.field_Vars1
        k = obj.startIndex
        obj.v1 = x[k]
        obj.v2 = x[k + 1]
        obj.v3 = x[k + 2]
        obj.v4 = x[k + 3]
        obj.v5 = x[k + 4]
    end

    for obj in components.field_Vars2
        k = obj.startIndex
        obj.r1 = x[k]
        obj.r2 = x[k + 1]
        obj.r3 = x[k + 2]
        obj.r4 = x[k + 3]
        obj.r5 = x[k + 4]
    end
    return nothing
end

# Group the objects in `vars` by their concrete type. The result maps each type T
# that occurs to a Vector{T} holding the objects of that type in their original order.
function buildComponents!(vars::Vector{Any})
    groups = Dict()
    for obj in vars
        T = typeof(obj)
        # Look up the bucket for T, creating an empty Vector{T} on first encounter.
        bucket = get!(() -> T[], groups, T)
        push!(bucket, obj)
    end
    return groups
end

# Create nc pairs of (Vars1, Vars2) objects whose start indices tile a vector of
# length 10*nc, store them in `components` grouped by type, and return a random
# vector `x` of matching length.
function buildVars!(nc::Int, components::Components)
    vars = Any[]
    for k in 0:nc-1
        base = 10k + 1   # pair k occupies x[base] .. x[base + 9]
        push!(vars, Vars1(1.0, 2.0, 3.0, 4.0, 5.0, base))
        push!(vars, Vars2(6.0, 7.0, 8.0, 9.0, 10.0, base + 5))
    end
    x = randn(nc*10)
    by_type = buildComponents!(vars)
    components.field_Vars1 = by_type[Vars1]
    components.field_Vars2 = by_type[Vars2]

    return x
end


# Benchmarks for increasing problem sizes. Each size is run once untimed before
# the timed call, so the measurement reflects a repeated call.
nc=2; x = buildVars!(nc, components)
println("\nnc = $nc")
      copy_from_x!(x, components)
@time copy_from_x!(x, components)

nc=10; x = buildVars!(nc, components)
println("\nnc = $nc")
      copy_from_x!(x, components)
@time copy_from_x!(x, components)

nc=100; x = buildVars!(nc, components)
println("\nnc = $nc")
      copy_from_x!(x, components)
@time copy_from_x!(x, components)

nc=10000; x = buildVars!(nc, components)
println("\nnc = $nc")
      copy_from_x!(x, components)
@time copy_from_x!(x, components)

end   # closes `module Test4`

When executing the above, the result is:

nc = 2
  0.000002 seconds (4 allocations: 160 bytes)

nc = 10
  0.000001 seconds (3 allocations: 144 bytes)

nc = 100
  0.000002 seconds (4 allocations: 160 bytes)

nc = 10000
  0.000115 seconds (4 allocations: 160 bytes)

So no memory is allocated during the copy-operation and there are no function calls in the copy-operation (and especially no run-time multiple dispatch function calls as in the previous solution).