"Optional" in-place output arguments

What I meant is something like this; I'm not sure whether it adapts to what you want:

struct OutputX
    x
end

struct OutputXY
    x
    y
end

function _inner_solver(output::OutputX)
    # stuff dealing with x
end

function _inner_solver(output::OutputXY)
    # stuff dealing with x and y
end

function solver(f)
    # x is whatever data/state the solver sets up internally for the callback
    output = f(x)
    _inner_solver(output)
end

# the user

function foo1!(x)
    # stuff
    return OutputX(x)
end

#or

function foo2!(x,y)
    # stuff
    return OutputXY(x,y)
end

# calling the solver

solver(foo1!)

#or

solver((x) -> foo2!(x, y))   # the closure captures y from the calling scope



Now I see how you are thinking. Great pattern: dispatching on the output of a callback function.

This pattern relies on coding one or several foo! functions/methods. But I am beginning to accept that this is unavoidable (either that, or if-then…)
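
Just to make the if-then alternative concrete, here is a rough sketch of what I keep trying to avoid (my own made-up names, not real code from my project; `nothing` marks an output that isn't wanted):

function foo_branching!(o1, o2, i1, i2)
    o1 .= 2 .* i1                  # always computed
    if o2 !== nothing              # optional output: one explicit branch per optional argument
        o2 .= 3 .* (i1 .+ i2)
    end
    return nothing
end

# caller that only wants o1:
# foo_branching!(o1, nothing, i1, i2)

Every optional output adds another branch to read and maintain, which is exactly what the dispatch-on-output pattern avoids.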

:rofl: You just might have a point!
KISS… But I couldn’t help trying!

Firstly: Hi there!
You were on the right track there, but just barely missed it :smiley:
@code_lowered foo!(o1,o2,o3,i1,i2)
shows the code before any optimization. @code_typed can show the optimized version.

I'm not going to copy the whole output of `@code_typed optimize=true foo!(...)` here, but it does seem to work to a certain extent.

Even with optimize=false you can see that it figured out calls that do nothing:

julia> o1 = Vector{Float64}(undef,2); o2 = NotWanted(); o3 = NotWanted()
NotWanted()

julia> @code_typed optimize=false foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─ %1 = (2 * i1)::Vector{Float64}
β”‚        Base.setindex!(o1, %1, Main.:(:))::Any
β”‚   %3 = Base.broadcasted(Main.:+, i1, i2)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(+), Tuple{Vector{Float64}, Vector{Float64}}}
β”‚   %4 = Base.materialize(%3)::Vector{Float64}
β”‚   %5 = (3 * %4)::Vector{Float64}
β”‚        Base.setindex!(o2, %5, Main.:(:))::Union{}
β”‚        Core.Const(:(Base.broadcasted(Main.:+, i1, i2)))::Union{}
β”‚        Core.Const(:(tmp = Base.materialize(%7)))::Union{}
β”‚        Core.Const(:(3 * tmp))::Union{}
β”‚        Core.Const(:(Base.setindex!(o3, %9, Main.:(:))))::Union{}
└──      Core.Const(:(return %9))::Union{}
) => Union{}

julia> o1 = Vector{Float64}(undef,2); o2 = Vector{Int}(undef, 2); o3 = Vector{Int}(undef, 2)
2-element Vector{Int64}:
 209904464
 209904896

julia> @code_typed optimize=false foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─ %1 = (2 * i1)::Vector{Float64}
β”‚        Base.setindex!(o1, %1, Main.:(:))::Any
β”‚   %3 = Base.broadcasted(Main.:+, i1, i2)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(+), Tuple{Vector{Float64}, Vector{Float64}}}
β”‚   %4 = Base.materialize(%3)::Vector{Float64}
β”‚   %5 = (3 * %4)::Vector{Float64}
β”‚        Base.setindex!(o2, %5, Main.:(:))::Any
β”‚   %7 = Base.broadcasted(Main.:+, i1, i2)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(+), Tuple{Vector{Float64}, Vector{Float64}}}
β”‚        (tmp = Base.materialize(%7))::Vector{Float64}
β”‚   %9 = (3 * tmp)::Vector{Float64}
β”‚        Base.setindex!(o3, %9, Main.:(:))::Any
└──      return %9
) => Vector{Float64}

The same with optimize=true is way too unwieldy to fit here, but if you look at the length and certain key lines, it does seem to do quite well:

julia> o1 = Vector{Float64}(undef,2); o2 = NotWanted(); o3 = NotWanted();

julia> @code_typed optimize=true foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─── %1   = invoke Main.:*(2::Int64, i1::Vector{Float64})::Vector{Float64}
β”‚     %2   = Base.arraylen(o1)::Int64
└────        goto #6 if not true
2 ─── %4   = Base.arraylen(%1)::Int64
β”‚     %5   = (%4 === %2)::Bool
└────        goto #4 if not %5
3 ───        goto #5
4 ─── %8   = Core.tuple(%2)::Tuple{Int64}
β”‚            invoke Base.throw_setindex_mismatch(%1::Vector{Float64}, %8::Tuple{Int64})::Union{}
└────        unreachable
5 ───        nothing::Nothing
6 ┄── %12  = Base.slt_int(0, %2)::Bool
└────        goto #16 if not %12
7 ─── %14  = $(Expr(:gc_preserve_begin, Core.Argument(2)))
β”‚     %15  = $(Expr(:gc_preserve_begin, :(%1)))
β”‚     %16  = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), Core.Argument(2)))::Ptr{Float64}
β”‚            Base.arraysize(o1, 1)::Int64
β”‚     %18  = Core.bitcast(Core.UInt, %16)::UInt64
β”‚     %19  = Base.add_ptr(%18, 0x0000000000000000)::UInt64
β”‚     %20  = Core.bitcast(Ptr{Float64}, %19)::Ptr{Float64}
β”‚     %21  = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%1)))::Ptr{Float64}
β”‚            Base.arraysize(%1, 1)::Int64
β”‚     %23  = Core.bitcast(Core.UInt, %21)::UInt64
β”‚     %24  = Base.add_ptr(%23, 0x0000000000000000)::UInt64
β”‚     %25  = Core.bitcast(Ptr{Float64}, %24)::Ptr{Float64}
β”‚     %26  = Base.mul_int(%2, 8)::Int64
β”‚     %27  = Core.lshr_int(%26, 63)::Int64
β”‚     %28  = Core.trunc_int(Core.UInt8, %27)::UInt8
β”‚     %29  = Core.eq_int(%28, 0x01)::Bool
└────        goto #9 if not %29
8 ───        invoke Core.throw_inexacterror(:check_top_bit::Symbol, UInt64::Type{UInt64}, %26::Int64)::Union{}
└────        unreachable
9 ───        goto #10
10 ── %34  = Core.bitcast(Core.UInt64, %26)::UInt64
└────        goto #11
11 ──        goto #12
12 ──        goto #13
13 ──        goto #14
.
.
.
%109::Tuple{Base.OneTo{Int64}})::Union{}
└────        unreachable
99 ──        goto #100
100 ─        goto #101
101 ─        goto #102
102 ─ %273 = invoke Main.:*(3::Int64, %114::Vector{Float64})::Vector{Float64}
β”‚            Base.setindex!(o2, %273, Main.:(:))::Union{}
└────        unreachable
) => Union{}

And I found no remaining mention of o2, aside from the unreachable at the end, and none of o3 at all.

Even better:

julia> o1 = NotWanted(); o2 = NotWanted(); o3 = NotWanted();

julia> @code_typed optimize=true foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─ %1 = invoke Main.:*(2::Int64, i1::Vector{Float64})::Vector{Float64}
β”‚        Base.setindex!(o1, %1, Main.:(:))::Union{}
└──      unreachable
) => Union{}

Without NotWanteds:

julia> o1 = Vector{Float64}(undef,2); o2 = Vector{Int}(undef, 2); o3 = Vector{Int}(undef, 2);

julia> @code_typed optimize=true foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─── %1   = invoke Main.:*(2::Int64, i1::Vector{Float64})::Vector{Float64}
β”‚     %2   = Base.arraylen(o1)::Int64
└────        goto #6 if not true
2 ─── %4   = Base.arraylen(%1)::Int64
β”‚     %5   = (%4 === %2)::Bool
└────        goto #4 if not %5
3 ───        goto #5
4 ─── %8   = Core.tuple(%2)::Tuple{Int64}
β”‚            invoke Base.throw_setindex_mismatch(%1::Vector{Float64}, %8::Tuple{Int64})::Union{}
└────        unreachable
5 ───        nothing::Nothing
6 ┄── %12  = Base.slt_int(0, %2)::Bool
└────        goto #16 if not %12
7 ─── %14  = $(Expr(:gc_preserve_begin, Core.Argument(2)))
β”‚     %15  = $(Expr(:gc_preserve_begin, :(%1)))
β”‚     %16  = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), Core.Argument(2)))::Ptr{Float64}
β”‚            Base.arraysize(o1, 1)::Int64
β”‚     %18  = Core.bitcast(Core.UInt, %16)::UInt64
β”‚     %19  = Base.add_ptr(%18, 0x0000000000000000)::UInt64
β”‚     %20  = Core.bitcast(Ptr{Float64}, %19)::Ptr{Float64}
β”‚     %21  = $(Expr(:foreigncall, :(:jl_array_ptr), Ptr{Float64}, svec(Any), 0, :(:ccall), :(%1)))::Ptr{Float64}
β”‚            Base.arraysize(%1, 1)::Int64
β”‚     %23  = Core.bitcast(Core.UInt, %21)::UInt64
β”‚     %24  = Base.add_ptr(%23, 0x0000000000000000)::UInt64
β”‚     %25  = Core.bitcast(Ptr{Float64}, %24)::Ptr{Float64}
β”‚     %26  = Base.mul_int(%2, 8)::Int64
β”‚     %27  = Core.lshr_int(%26, 63)::Int64
β”‚     %28  = Core.trunc_int(Core.UInt8, %27)::UInt8
β”‚     %29  = Core.eq_int(%28, 0x01)::Bool
└────        goto #9 if not %29
8 ───        invoke Core.throw_inexacterror(:check_top_bit::Symbol, UInt64::Type{UInt64}, %26::Int64)::Union{}
└────        unreachable
9 ───        goto #10
10 ── %34  = Core.bitcast(Core.UInt64, %26)::UInt64
└────        goto #11
11 ──        goto #12
12 ──        goto #13
13 ──        goto #14
.
.
.
102 ─ %273 = invoke Main.:*(3::Int64, %114::Vector{Float64})::Vector{Float64}
β”‚     %274 = Base.arraysize(o2, 1)::Int64
β”‚     %275 = Base.slt_int(%274, 0)::Bool
β”‚     %276 = Base.ifelse(%275, 0, %274)::Int64
β”‚     %277 = %new(Base.OneTo{Int64}, %276)::Base.OneTo{Int64}
β”‚     %278 = %new(Base.Slice{Base.OneTo{Int64}}, %277)::Base.Slice{Base.OneTo{Int64}}
└────        goto #104 if not true
103 ─        Base.arraysize(o2, 1)::Int64
└────        Base.nothing::Nothing
104 β”„        invoke Base._unsafe_setindex!($(QuoteNode(IndexLinear()))::IndexLinear, o2::Vector{Int64}, %273::Vector{Float64}, %278::Base.Slice{Base.OneTo{Int64}})::Any
.
.
.
191 ─ %511 = invoke Main.:*(3::Int64, %352::Vector{Float64})::Vector{Float64}
β”‚     %512 = Base.arraysize(o3, 1)::Int64
β”‚     %513 = Base.slt_int(%512, 0)::Bool
β”‚     %514 = Base.ifelse(%513, 0, %512)::Int64
β”‚     %515 = %new(Base.OneTo{Int64}, %514)::Base.OneTo{Int64}
β”‚     %516 = %new(Base.Slice{Base.OneTo{Int64}}, %515)::Base.Slice{Base.OneTo{Int64}}
└────        goto #193 if not true
192 ─        Base.arraysize(o3, 1)::Int64
└────        Base.nothing::Nothing
193 β”„        invoke Base._unsafe_setindex!($(QuoteNode(IndexLinear()))::IndexLinear, o3::Vector{Int64}, %511::Vector{Float64}, %516::Base.Slice{Base.OneTo{Int64}})::Any
└────        goto #194
194 ─        goto #195
195 ─        return %511
) => Vector{Float64}

And am I the only one who would be curious to see how well the compiler's code elimination fares in such an application? I do have to say I support Imiq's suggestion: it'll probably be a lot less of a headache to maintain in the long run. But I find your idea interesting, and it would be cool to see how well it performs. Have a nice day!

Oh, so I was looking (@code_lowered) at an early stage in the compilation process, before specialisation into method instances even takes place, so I would not see the code elision if it occurs…

I see that in the β€œnothing wanted” case that you provide, the @code_typed output is indeed elided. However, if I want o1 (but not o2), no elision occurs, as suggested by the partial output below.

102 ─ %273 = invoke Main.:*(3::Int64, %114::Vector{Float64})::Vector{Float64}
β”‚            Base.setindex!(o2, %273, Main.:(:))::Union{}
└────        unreachable

So I was not completely crazy to make a wish - but it’s not Christmas yet.

Many thanks!

P.S. I am now working on redesigning my code to keep things conceptually simple…

Well, I’m not entirely sure how much elision happens. There’s a considerable difference in the number of instructions between the case where only o1 is wanted and the case where o1 and o2 are wanted, so something must have happened ^^ Additionally, even the case where we want nothing at all seems to keep the multiplication for some reason:

invoke Main.:*(2::Int64, i1::Vector{Float64})::Vector{Float64}
β”‚        Base.setindex!(o1, %1, Main.:(:))::Union{}

But I’m not experienced enough to know whether this is just a display artifact or some weird limitation on how many instructions the compiler can get rid of. It does seem to work in principle, though, which is nice to know.
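
If anyone wants to dig further, the LLVM IR for the different cases is usually easier to scan than the native code. Something like this would be the place to start (just a suggestion, I haven't run that comparison here):

@code_llvm debuginfo=:none foo!(o1,o2,o3,i1,i2)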

Native
julia> o1 = NotWanted(); o2 = NotWanted(); o3 = NotWanted();

julia> @code_native foo!(o1,o2,o3,i1,i2)
        .text
; β”Œ @ REPL[3]:1 within `foo!`
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %rsi
        subq    $88, %rsp
        movq    %rdx, %rsi
        vxorps  %xmm0, %xmm0, %xmm0
        vmovaps %xmm0, -32(%rbp)
        movq    $0, -16(%rbp)
        movq    %rsi, -64(%rbp)
        movl    $jl_get_pgcstack, %eax
        callq   *%rax
        movq    $4, -32(%rbp)
        movq    (%rax), %rcx
        movq    %rcx, -24(%rbp)
        leaq    -32(%rbp), %rcx
        movq    %rcx, (%rax)
        movq    24(%rsi), %rdx
; β”‚ @ REPL[3]:2 within `foo!`
        movabsq $"*", %rax
        movl    $2, %ecx
        callq   *%rax
        movq    %rax, -16(%rbp)
        movq    $184877720, -56(%rbp)           # imm = 0xB050298
        movq    %rax, -48(%rbp)
        movq    $286725520, -40(%rbp)           # imm = 0x11171590
        movabsq $jl_apply_generic, %rax
        leaq    -56(%rbp), %rdx
        movl    $338059168, %ecx                # imm = 0x14265FA0
        movl    $3, %r8d
        callq   *%rax
        ud2
; β””

It seems to be too much to assume that it elided everything. So there is likely some kind of limitation in place. Make a wish then ^^

Good luck with your rework!

I think the issue has been covered pretty well by the previous answers, and wish you the best with your redesign.

I just wanted to mention a couple of things here regarding dead code elimination (DCE), because apart from the API/design discussion, this is what would have allowed users/developers to avoid worrying about which piece of code is needed in which context.

First, as always when optimizing code, it’s easy to get lost in code modifications and to end up interpreting the performance of a code that does not perform the desired computation anymore.

Here, having Union{} (i.e. the bottom type, for which no value exists) as a return type is a bad sign. It generally means that the compiler can prove that the code will always error.

(and indeed it does, as becomes apparent if we actually run it)
julia> struct NotWanted end

julia> function setindex!(A::NotWanted,X,inds...) end
setindex! (generic function with 1 method)

julia> function foo!(o1,o2,o3,i1,i2)
           o1[:] = 2*i1
           o2[:] = 3(i1.+i2)
           tmp   = i1.+i2
           o3[:] = 3tmp
       end
foo! (generic function with 1 method)

julia> o1 = Vector{Float64}(undef,2);

julia> o2 = NotWanted();

julia> o3 = NotWanted();

julia> i1 = randn(2);

julia> i2 = randn(2);

julia> @code_lowered foo!(o1,o2,o3,i1,i2)
CodeInfo(
1 ─ %1 = 2 * i1
β”‚        Base.setindex!(o1, %1, Main.:(:))
β”‚   %3 = Base.broadcasted(Main.:+, i1, i2)
β”‚   %4 = Base.materialize(%3)
β”‚   %5 = 3 * %4
β”‚        Base.setindex!(o2, %5, Main.:(:))
β”‚   %7 = Base.broadcasted(Main.:+, i1, i2)
β”‚        tmp = Base.materialize(%7)
β”‚   %9 = 3 * tmp
β”‚        Base.setindex!(o3, %9, Main.:(:))
└──      return %9
)

julia> foo!(o1,o2,o3,i1,i2)
ERROR: MethodError: no method matching setindex!(::NotWanted, ::Vector{Float64}, ::Colon)
Stacktrace:
 [1] foo!(o1::Vector{Float64}, o2::NotWanted, o3::NotWanted, i1::Vector{Float64}, i2::Vector{Float64})
   @ Main ./REPL[3]:3
 [2] top-level scope
   @ REPL[10]:1
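
As a small standalone check (not part of the original example), the same Union{} signal shows up directly in inference for any call that provably always throws:

alwaysthrows() = error("boom")
Base.return_types(alwaysthrows, Tuple{})   # Any[Union{}]: inference proves the call never returns normally

So whenever a whole method body gets annotated with Union{}, it is worth looking for an error like the one above before reading anything into the generated code.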

FWIW, the problem here is that a lot of things are needed besides simply setindex! to make the broadcast assignment work.
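
To make that concrete (this is just me lowering the two assignment forms, not code from the examples above): indexed assignment only needs setindex!, while dotted assignment goes through the whole broadcasting machinery:

Meta.@lower o1[:] = 2*i1             # roughly: Base.setindex!(o1, 2*i1, :)
Meta.@lower o2 .= 3 .* (i1 .+ i2)    # roughly: Base.materialize!(o2, Base.broadcasted(*, 3, Base.broadcasted(+, i1, i2)))

So a NotWanted that only absorbs setindex! will not silence a .= assignment.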

Another note is that I would have expected dead code elimination to actually kick in here, but it does not appear to work so well in this case:

julia> function foo2!(o1,o2,o3,i1,i2)
           o1 .= 2 .* i1
           o2 .= 3 .* (i1.+i2)
           tmp = i1 .+ i2      # <- allocation here
           # o3 .= 3 .* tmp    # <- commenting this out turns the line above into dead code
                               #    will it trigger dead-code elimination?
           nothing
       end
foo2! (generic function with 1 method)

julia> o1 = Vector{Float64}(undef,2);
julia> o2 = similar(o1);
julia> o3 = similar(o1);
julia> i1 = randn(2);
julia> i2 = randn(2);

julia> using BenchmarkTools
julia> @btime foo2!($o1,$o2,$o3,$i1,$i2)
  31.566 ns (1 allocation: 80 bytes)

I interpret this (the allocation) as proof that the tmp array was allocated and computed, although it isn’t used afterwards.

This is actually quite surprising to me. I suspect that the allocation itself is considered a side effect that cannot be elided by compiler optimizations, but I might very well be wrong.

In any case, it looks like DCE does work in other cases, for example:

julia> function foo3!(o1,o2,o3,i1,i2)
           @inbounds tmp = 1 ./ i2[1]
           # o3 .= i2 .* tmp  # <- commenting this out turns the line above into dead code
                              #    will it trigger dead-code elimination?
           nothing
       end
foo3! (generic function with 1 method)

julia> o1 = Vector{Float64}(undef,2);
julia> o2 = similar(o1);
julia> o3 = similar(o1);
julia> i1 = randn(2);
julia> i2 = randn(2);

julia> @code_native foo3!(o1,o2,o3,i1,i2)
        .text
; β”Œ @ REPL[44]:1 within `foo3!`
        movq    %rsi, -8(%rsp)
        movabsq $140354864816136, %rax          # imm = 0x7FA6E9DC2008
; β”‚ @ REPL[44]:5 within `foo3!`
        retq
; β””

foo3! does not seem to do anything at all (no FP division, in any case), so it looks like the computation of tmp was elided there.


I suspect it’s possible that some bounds check could error somewhere in that call stack, which would cause this to not be removed? Not sure without diagnostics, though.
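
One cheap way to test that suspicion would be to put a possibly-throwing access back in on purpose and compare; a sketch, I haven't run it:

function foo3b!(o1,o2,o3,i1,i2)
    tmp = 1 / i2[1]   # no @inbounds here: the access can throw a BoundsError, so it should stay
    nothing
end

@code_native foo3b!(o1,o2,o3,i1,i2)

If error-handling code shows up again in the output, then a potential error is indeed what keeps otherwise-dead code alive.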


Many thanks!

It never occurred to me that faulty code could pass the early stages of compilation, although in hindsight that’s obvious.

I should have written

function Base.setindex!(A::NotWanted,X,inds...) end 

(It’s all in the Base. part…)

AND THIS WORKS.

At least I can see it work if the input arrays are StaticArrays (which is my real use case anyway). There is a gain in performance, and the @code_typed output clearly changes length.

using StaticArrays
using BenchmarkTools

struct NotWanted end 
function Base.setindex!(A::NotWanted,X,inds...) end 

function foo!(o1,o2,o3,i1,i2)
        o1[:] = 2*i1
        o2[:] = 3(i1.+i2)
        tmp   = i1.+i2
        o3[:] = 3tmp
        return nothing
end

const n = 100
o1 = Vector{Float64}(undef,n)
o2 = Vector{Float64}(undef,n)
o3 = Vector{Float64}(undef,n)
nw = NotWanted()
i1 = @SVector [randn() for i = 1:n]
i2 = @SVector [randn() for i = 1:n]

@btime foo!(o1,o2,o3,i1,i2)
@btime foo!(nw,o2,o3,i1,i2)
@btime foo!(o1,nw,o3,i1,i2)
@btime foo!(o1,o2,nw,i1,i2)
@btime foo!(o1,nw,nw,i1,i2)
@btime foo!(nw,o2,nw,i1,i2)
@btime foo!(nw,nw,o3,i1,i2)
@btime foo!(nw,nw,nw,i1,i2)

yields

  268.151 ns (0 allocations: 0 bytes)
  181.903 ns (0 allocations: 0 bytes)
  180.184 ns (0 allocations: 0 bytes)
  179.268 ns (0 allocations: 0 bytes)
  98.613 ns (0 allocations: 0 bytes)
  105.870 ns (0 allocations: 0 bytes)
  110.408 ns (0 allocations: 0 bytes)
  16.733 ns (0 allocations: 0 bytes)

The Julia creators comment that β€œimmutables are easier to reason about”: maybe the compiler finds it easier to prove that a stack-allocated immutable variable will never be needed…
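
One small check supporting the stack-allocation part (my own quick test, not rigorous): with SVector inputs the temporaries are themselves isbits SVectors and never touch the heap, so there is no allocation β€œside effect” left for the compiler to worry about when it deletes a dead branch:

using StaticArrays, BenchmarkTools
i1 = @SVector [randn() for i = 1:100]
i2 = @SVector [randn() for i = 1:100]
@btime $i1 .+ $i2;   # expected: 0 allocations, the result is a stack-allocated SVector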

Thanks everyone!


I think this is the right place to note that dead code elimination is just an optimization, and it may fail. You shouldn’t rely on it to produce the exact code you’re looking for. Anything the compiler does to β€œnot do work” is an optimization and may not happen at all.

As noted multiple times above, structuring your code/framework in such a way to be easy for both your users and the compiler to reason about will make everyone’s life easier.


I agree - it’s risky to rely on the compiler, and my real-world code might be harder to optimise than this MWE.

But I think that this solution stays well clear of the danger of being hard to explain to the user, which would have been a serious concern.

I am still thinking about code restructuring. But I can clock off work today feeling mightily pleased with myself. :grinning: