# Understanding relative performance of (returning vs. mutating vs. both) functions

Iβm trying to choose the fastest among the following 3 functions:

``````using Test, BenchmarkTools

function Mut!(x::Array{Int64,2},y::Array{Int64,1})
map!(z->z*2,x,x)
y .= vec(reduce(+,x,dims=1))
end
function MutRet(x::Array{Int64,2})
map!(z->z*2,x,x)
vec(reduce(+,x,dims=1))
end
function Ret(x::Array{Int64,2})
x2 = map(z->z*2,x)
y = vec(reduce(+,x2,dims=1))
return x2,y
end

@testset "mutate and return" begin

x = Int64.(fill(5,2,10))
y = fill(5,10)
Mut!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRet(x)
@test all(x .== 10)
@test all(y .== 20)

x = Int64.(fill(5,2,10))
x,y = Ret(x)
@test all(x .== 10)
@test all(y .== 20)
end

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
@code_warntype Ret(x)

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
@btime Ret(x);
``````
type-stability
``````Variables
#self#::Core.Compiler.Const(Mut!, false)
x::Array{Int64,2}
y::Array{Int64,1}
#321::var"#321#322"

Body::Array{Int64,1}
1 β       (#321 = %new(Main.:(var"#321#322")))
β   %2  = #321::Core.Compiler.Const(var"#321#322"(), false)
β         Main.map!(%2, x, x)
β   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β   %10 = Main.vec(%9)::Array{Int64,1}
β   %12 = Base.materialize!(y, %11)::Array{Int64,1}
βββ       return %12
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#323::var"#323#324"

Body::Array{Int64,1}
1 β       (#323 = %new(Main.:(var"#323#324")))
β   %2  = #323::Core.Compiler.Const(var"#323#324"(), false)
β         Main.map!(%2, x, x)
β   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β   %10 = Main.vec(%9)::Array{Int64,1}
βββ       return %10
Variables
#self#::Core.Compiler.Const(Ret, false)
x::Array{Int64,2}
#325::var"#325#326"
x2::Array{Int64,2}
y::Array{Int64,1}

Body::Tuple{Array{Int64,2},Array{Int64,1}}
1 β       (#325 = %new(Main.:(var"#325#326")))
β   %2  = #325::Core.Compiler.Const(var"#325#326"(), false)
β         (x2 = Main.map(%2, x))
β   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
β   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
β   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β   %9  = (%8)(%7, Main.reduce, Main.:+, x2)::Array{Int64,2}
β         (y = Main.vec(%9))
β   %11 = Core.tuple(x2, y)::Tuple{Array{Int64,2},Array{Int64,1}}
βββ       return %11
``````
benchmarks
``````82.646 ns (3 allocations: 240 bytes)
74.559 ns (3 allocations: 240 bytes)
105.459 ns (6 allocations: 528 bytes)

``````

I have a couple questions:

1. why is MutRet faster than Ret?
2. why is the number of allocations listed for MutRet is the same as Mut, even though MutRet needs to create the [additional] return value `vec(reduce(+,x,dims=1))`

βedited original for several errors as pointed out in commentsβ

Note that this makes a new array, and then copies its contents into `foo`. You might be looking for `map!`. Or for `sum(abs2, foo)`, if you donβt need the array β your functions return different things. And finally, you are timing things on really tiny arrays; is your real problem this size?

Thanks @mcabbott, Iβm updating the examples to fix the issue of different calculations and increase data size (my actual use case is indeed very large).
Looks like the results are now more in line with expected:

1. functions that donβt allocate as much are faster

It is still odd that `MutRet()` is still faster than `Mut()` despite the fact that it has to allocate another N words for the output `bar`.

Youβre still allocating the result of the reduction though. Loop through views of columns and sum the results to scalars and youβll see that work out better.

Thanks, Chris. Somehow my implementation of summed views is doing extra allocations (please see below). Do you have any suggestions?:

``````using Test, BenchmarkTools

function Mut!(x::Array{Int64,2},y::Array{Int64,1})
map!(z->z*2,x,x)
y .= vec(reduce(+,x,dims=1))
end
function MutRet(x::Array{Int64,2})
map!(z->z*2,x,x)
vec(reduce(+,x,dims=1))
end
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
map!(z->z*2,x,x)
y .= [sum(view(x,:,z)) for z in 1:size(x,2)]
end
function MutRetView(x::Array{Int64,2})
map!(z->z*2,x,x)
[sum(view(x,:,z)) for z in 1:size(x,2)]
end
``````
tests

@testset βmutate and returnβ begin
x = Int64.(fill(5,2,10))
y = fill(5,10)
Mut!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRet(x)
@test all(x .== 10)
@test all(y .== 20)

x = Int64.(fill(5,2,10))
y = fill(5,10)
MutView!(x,y)
@test all(x .== 10)
@test all(y .== 20)

x = fill(5,2,10)
y = MutRetView(x)
@test all(x .== 10)
@test all(y .== 20)
end

Test Summary: | Pass Total
mutate and return | 8 8
Test.DefaultTestSet(βmutate and returnβ, Any, 8, false)

type-stability

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype MutView!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

Body::Array{Int64,1}
1 β (#114 = %new(Main.:(var"#114#115")))
β %2 = #114::Core.Compiler.Const(var"#114#115"(), false)
β Main.map!(%2, x, x)
β %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β %10 = Main.vec(%9)::Array{Int64,1}
β %12 = Base.materialize!(y, %11)::Array{Int64,1}
βββ return %12
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 β (#116 = %new(Main.:(var"#116#117")))
β %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
β Main.map!(%2, x, x)
β %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β %10 = Main.vec(%9)::Array{Int64,1}
βββ return %10
Variables
#self#::Core.Compiler.Const(MutView!, false)
x::Array{Int64,2}
y::Array{Int64,1}
#118::var"#118#120"
#119::var"#119#121"{Array{Int64,2}}

Body::Array{Int64,1}
1 β (#118 = %new(Main.:(var"#118#120")))
β %2 = #118::Core.Compiler.Const(var"#118#120"(), false)
β Main.map!(%2, x, x)
β %4 = Main.:(var"#119#121")::Core.Compiler.Const(var"#119#121", false)
β %5 = Core.typeof(x)::Core.Compiler.Const(Array{Int64,2}, false)
β %6 = Core.apply_type(%4, %5)::Core.Compiler.Const(var"#119#121"{Array{Int64,2}}, false)
β (#119 = %new(%6, x))
β %8 = #119::var"#119#121"{Array{Int64,2}}
β %9 = Main.size(x, 2)::Int64
β %10 = (1:%9)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
β %11 = Base.Generator(%8, %10)::Core.Compiler.PartialStruct(Base.Generator{UnitRange{Int64},var"#119#121"{Array{Int64,2}}}, Any[var"#119#121"{Array{Int64,2}}, Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])])
β %12 = Base.collect(%11)::Array{Int64,1}
β %14 = Base.materialize!(y, %13)::Array{Int64,1}
βββ return %14
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 β (#116 = %new(Main.:(var"#116#117")))
β %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
β Main.map!(%2, x, x)
β %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
β %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
β %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
β %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
β %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
β %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
β %10 = Main.vec(%9)::Array{Int64,1}
βββ return %10

benchmarks

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime MutView!(x,y);

x = fill(5,2,10)
@btime MutRetView(x);

82.080 ns (3 allocations: 240 bytes)
74.842 ns (3 allocations: 240 bytes)
107.862 ns (13 allocations: 688 bytes)
99.907 ns (13 allocations: 688 bytes)

Comprehensions will allocate. You want to loop.

1 Like

Perfect, thanks!

``````function Mut!(x::Array{Int64,2},y::Array{Int64,1})
map!(z->z*2,x,x)
y .= vec(reduce(+,x,dims=1))
end
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
map!(z->z*2,x,x)
for i in 1:length(y)
view(y,i) = sum(@view x[:,i])
end
end
``````
benchmark
``````N = 2^10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

N = 10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

2.142 ΞΌs (3 allocations: 8.20 KiB)
543.016 ns (0 allocations: 0 bytes)
85.892 ns (3 allocations: 240 bytes)
9.877 ns (0 allocations: 0 bytes)
``````

This syntax means function definition, not assignment. Itβs fast because it doesnβt change `y`. You could write `.=`, or write `y[i] = sum(abs2, @view x[:,i])`. Or you could just write `sum!(abs2, y', x)`.

3 Likes

You may loop over `(sum(view(x,:,z)) for z in 1:size(x,2))` without allocations. There were allocations only because `[]` was used instead of `()`, using `[]` will collect the comprehension in a `Vector`.

Thanks @mcabbott , not sure why I left out the test block on that oneβ¦