Understanding relative performance of (returning vs. mutating vs. both) functions

I’m trying to choose the fastest among the following 3 functions:

using Test, BenchmarkTools

function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z->z*2,x,x)
    y .= vec(reduce(+,x,dims=1))
end
function MutRet(x::Array{Int64,2})
    map!(z->z*2,x,x)
    vec(reduce(+,x,dims=1))
end
function Ret(x::Array{Int64,2})
    x2 = map(z->z*2,x)
    y = vec(reduce(+,x2,dims=1))
    return x2,y
end

@testset "mutate and return" begin

x = Int64.(fill(5,2,10))
y = fill(5,10)
Mut!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRet(x)
@test all(x .== 10)
@test all(y .== 20)

x = Int64.(fill(5,2,10))
x,y = Ret(x)
@test all(x .== 10)
@test all(y .== 20)
end

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
@code_warntype Ret(x)


x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
@btime Ret(x);
type-stability
Variables
  #self#::Core.Compiler.Const(Mut!, false)
  x::Array{Int64,2}
  y::Array{Int64,1}
  #321::var"#321#322"

Body::Array{Int64,1}
1 ─       (#321 = %new(Main.:(var"#321#322")))
β”‚   %2  = #321::Core.Compiler.Const(var"#321#322"(), false)
β”‚         Main.map!(%2, x, x)
β”‚   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β”‚   %10 = Main.vec(%9)::Array{Int64,1}
β”‚   %11 = Base.broadcasted(Base.identity, %10)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
β”‚   %12 = Base.materialize!(y, %11)::Array{Int64,1}
└──       return %12
Variables
  #self#::Core.Compiler.Const(MutRet, false)
  x::Array{Int64,2}
  #323::var"#323#324"

Body::Array{Int64,1}
1 ─       (#323 = %new(Main.:(var"#323#324")))
β”‚   %2  = #323::Core.Compiler.Const(var"#323#324"(), false)
β”‚         Main.map!(%2, x, x)
β”‚   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β”‚   %10 = Main.vec(%9)::Array{Int64,1}
└──       return %10
Variables
  #self#::Core.Compiler.Const(Ret, false)
  x::Array{Int64,2}
  #325::var"#325#326"
  x2::Array{Int64,2}
  y::Array{Int64,1}

Body::Tuple{Array{Int64,2},Array{Int64,1}}
1 ─       (#325 = %new(Main.:(var"#325#326")))
β”‚   %2  = #325::Core.Compiler.Const(var"#325#326"(), false)
β”‚         (x2 = Main.map(%2, x))
β”‚   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚   %9  = (%8)(%7, Main.reduce, Main.:+, x2)::Array{Int64,2}
β”‚         (y = Main.vec(%9))
β”‚   %11 = Core.tuple(x2, y)::Tuple{Array{Int64,2},Array{Int64,1}}
└──       return %11
benchmarks
82.646 ns (3 allocations: 240 bytes)
  74.559 ns (3 allocations: 240 bytes)
  105.459 ns (6 allocations: 528 bytes)

I have a couple questions:

  1. why is MutRet faster than Ret?
  2. why is the number of allocations listed for MutRet is the same as Mut, even though MutRet needs to create the [additional] return value vec(reduce(+,x,dims=1))

β€”edited original for several errors as pointed out in commentsβ€”

Note that this makes a new array, and then copies its contents into foo. You might be looking for map!. Or for sum(abs2, foo), if you don’t need the array – your functions return different things. And finally, you are timing things on really tiny arrays; is your real problem this size?

Thanks @mcabbott, I’m updating the examples to fix the issue of different calculations and increase data size (my actual use case is indeed very large).
Looks like the results are now more in line with expected:

  1. functions that don’t allocate as much are faster

It is still odd that MutRet() is still faster than Mut() despite the fact that it has to allocate another N words for the output bar.

You’re still allocating the result of the reduction though. Loop through views of columns and sum the results to scalars and you’ll see that work out better.

Thanks, Chris. Somehow my implementation of summed views is doing extra allocations (please see below). Do you have any suggestions?:

using Test, BenchmarkTools

function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z->z*2,x,x)
    y .= vec(reduce(+,x,dims=1))
end
function MutRet(x::Array{Int64,2})
    map!(z->z*2,x,x)
    vec(reduce(+,x,dims=1))
end
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z->z*2,x,x)
    y .= [sum(view(x,:,z)) for z in 1:size(x,2)]
end
function MutRetView(x::Array{Int64,2})
    map!(z->z*2,x,x)
    [sum(view(x,:,z)) for z in 1:size(x,2)]
end
tests

@testset β€œmutate and return” begin
x = Int64.(fill(5,2,10))
y = fill(5,10)
Mut!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRet(x)
@test all(x .== 10)
@test all(y .== 20)

x = Int64.(fill(5,2,10))
y = fill(5,10)
MutView!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRetView(x)
@test all(x .== 10)
@test all(y .== 20)
end

Test Summary: | Pass Total
mutate and return | 8 8
Test.DefaultTestSet(β€œmutate and return”, Any, 8, false)

type-stability

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype MutView!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

Body::Array{Int64,1}
1 ─ (#114 = %new(Main.:(var"#114#115")))
β”‚ %2 = #114::Core.Compiler.Const(var"#114#115"(), false)
β”‚ Main.map!(%2, x, x)
β”‚ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β”‚ %10 = Main.vec(%9)::Array{Int64,1}
β”‚ %11 = Base.broadcasted(Base.identity, %10)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
β”‚ %12 = Base.materialize!(y, %11)::Array{Int64,1}
└── return %12
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 ─ (#116 = %new(Main.:(var"#116#117")))
β”‚ %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
β”‚ Main.map!(%2, x, x)
β”‚ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β”‚ %10 = Main.vec(%9)::Array{Int64,1}
└── return %10
Variables
#self#::Core.Compiler.Const(MutView!, false)
x::Array{Int64,2}
y::Array{Int64,1}
#118::var"#118#120"
#119::var"#119#121"{Array{Int64,2}}

Body::Array{Int64,1}
1 ─ (#118 = %new(Main.:(var"#118#120")))
β”‚ %2 = #118::Core.Compiler.Const(var"#118#120"(), false)
β”‚ Main.map!(%2, x, x)
β”‚ %4 = Main.:(var"#119#121")::Core.Compiler.Const(var"#119#121", false)
β”‚ %5 = Core.typeof(x)::Core.Compiler.Const(Array{Int64,2}, false)
β”‚ %6 = Core.apply_type(%4, %5)::Core.Compiler.Const(var"#119#121"{Array{Int64,2}}, false)
β”‚ (#119 = %new(%6, x))
β”‚ %8 = #119::var"#119#121"{Array{Int64,2}}
β”‚ %9 = Main.size(x, 2)::Int64
β”‚ %10 = (1:%9)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
β”‚ %11 = Base.Generator(%8, %10)::Core.Compiler.PartialStruct(Base.Generator{UnitRange{Int64},var"#119#121"{Array{Int64,2}}}, Any[var"#119#121"{Array{Int64,2}}, Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])])
β”‚ %12 = Base.collect(%11)::Array{Int64,1}
β”‚ %13 = Base.broadcasted(Base.identity, %12)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
β”‚ %14 = Base.materialize!(y, %13)::Array{Int64,1}
└── return %14
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 ─ (#116 = %new(Main.:(var"#116#117")))
β”‚ %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
β”‚ Main.map!(%2, x, x)
β”‚ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β”‚ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β”‚ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β”‚ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β”‚ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β”‚ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β”‚ %10 = Main.vec(%9)::Array{Int64,1}
└── return %10

benchmarks

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime MutView!(x,y);

x = fill(5,2,10)
@btime MutRetView(x);

82.080 ns (3 allocations: 240 bytes)
74.842 ns (3 allocations: 240 bytes)
107.862 ns (13 allocations: 688 bytes)
99.907 ns (13 allocations: 688 bytes)

Comprehensions will allocate. You want to loop.

1 Like

Perfect, thanks!

function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z->z*2,x,x)
    y .= vec(reduce(+,x,dims=1))
end
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z->z*2,x,x)
    for i in 1:length(y)
        view(y,i) = sum(@view x[:,i])
    end
end
benchmark
N = 2^10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

N = 10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

2.142 ΞΌs (3 allocations: 8.20 KiB)
543.016 ns (0 allocations: 0 bytes)
85.892 ns (3 allocations: 240 bytes)
9.877 ns (0 allocations: 0 bytes)

This syntax means function definition, not assignment. It’s fast because it doesn’t change y. You could write .=, or write y[i] = sum(abs2, @view x[:,i]). Or you could just write sum!(abs2, y', x).

3 Likes

You may loop over (sum(view(x,:,z)) for z in 1:size(x,2)) without allocations. There were allocations only because [] was used instead of (), using [] will collect the comprehension in a Vector.

Thanks @mcabbott , not sure why I left out the test block on that one…