Understanding relative performance of (returning vs. mutating vs. both) functions

mkarikom · August 31, 2020, 8:07pm

I’m trying to choose the fastest among the following 3 functions:

using Test, BenchmarkTools

"""
    Mut!(x, y)

Double every entry of `x` in place, then overwrite `y` with the column sums of
the doubled matrix. Returns `y`.
"""
function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    double = z -> z * 2
    map!(double, x, x)
    # Reducing along dims=1 yields a 1×n matrix; `vec` flattens it to length n.
    colsums = reduce(+, x; dims=1)
    y .= vec(colsums)
    return y
end
"""
    MutRet(x)

Double every entry of `x` in place and return the column sums of the doubled
matrix as a vector.
"""
function MutRet(x::Array{Int64,2})
    double = z -> z * 2
    map!(double, x, x)
    # Reducing along dims=1 yields a 1×n matrix; `vec` flattens it to length n.
    colsums = reduce(+, x; dims=1)
    return vec(colsums)
end
"""
    Ret(x)

Return `(x2, y)` where `x2` is a doubled copy of `x` and `y` holds the column
sums of `x2`. The input `x` is left unmodified.
"""
function Ret(x::Array{Int64,2})
    doubled = map(z -> z * 2, x)
    # Reducing along dims=1 yields a 1×n matrix; `vec` flattens it to length n.
    colsums = reduce(+, doubled; dims=1)
    return (doubled, vec(colsums))
end

# Each variant starts from a 2×10 matrix of fives; after doubling, every entry
# must be 10 and every column sum must be 20.
@testset "mutate and return" begin
    # In-place variant: both x and the preallocated y are overwritten.
    x = fill(Int64(5), 2, 10)
    y = fill(5, 10)
    Mut!(x, y)
    @test all(==(10), x)
    @test all(==(20), y)

    # Mutates x and returns the sums as a vector.
    x = fill(5, 2, 10)
    y = MutRet(x)
    @test all(==(10), x)
    @test all(==(20), y)

    # Non-mutating variant: rebind x to the returned doubled copy.
    x = fill(Int64(5), 2, 10)
    x, y = Ret(x)
    @test all(==(10), x)
    @test all(==(20), y)
end

# Inspect the inferred types of each variant on a fresh 2×10 input.
x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
@code_warntype Ret(x)


# Time each variant. Mut! and MutRet mutate x, so every benchmark evaluation
# doubles the matrix again; only the timings are of interest here.
# NOTE(review): x and y are non-const globals; BenchmarkTools usage normally
# interpolates them (`@btime Mut!($x, $y)`) — confirm whether that was intended.
x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
@btime Ret(x);

type-stability

Variables
  #self#::Core.Compiler.Const(Mut!, false)
  x::Array{Int64,2}
  y::Array{Int64,1}
  #321::var"#321#322"

Body::Array{Int64,1}
1 ─       (#321 = %new(Main.:(var"#321#322")))
│   %2  = #321::Core.Compiler.Const(var"#321#322"(), false)
│         Main.map!(%2, x, x)
│   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
│   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
│   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
│   %10 = Main.vec(%9)::Array{Int64,1}
│   %11 = Base.broadcasted(Base.identity, %10)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
│   %12 = Base.materialize!(y, %11)::Array{Int64,1}
└──       return %12
Variables
  #self#::Core.Compiler.Const(MutRet, false)
  x::Array{Int64,2}
  #323::var"#323#324"

Body::Array{Int64,1}
1 ─       (#323 = %new(Main.:(var"#323#324")))
│   %2  = #323::Core.Compiler.Const(var"#323#324"(), false)
│         Main.map!(%2, x, x)
│   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
│   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
│   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│   %9  = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
│   %10 = Main.vec(%9)::Array{Int64,1}
└──       return %10
Variables
  #self#::Core.Compiler.Const(Ret, false)
  x::Array{Int64,2}
  #325::var"#325#326"
  x2::Array{Int64,2}
  y::Array{Int64,1}

Body::Tuple{Array{Int64,2},Array{Int64,1}}
1 ─       (#325 = %new(Main.:(var"#325#326")))
│   %2  = #325::Core.Compiler.Const(var"#325#326"(), false)
│         (x2 = Main.map(%2, x))
│   %4  = (:dims,)::Core.Compiler.Const((:dims,), false)
│   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│   %6  = Core.tuple(1)::Core.Compiler.Const((1,), false)
│   %7  = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│   %8  = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│   %9  = (%8)(%7, Main.reduce, Main.:+, x2)::Array{Int64,2}
│         (y = Main.vec(%9))
│   %11 = Core.tuple(x2, y)::Tuple{Array{Int64,2},Array{Int64,1}}
└──       return %11

benchmarks

82.646 ns (3 allocations: 240 bytes)
  74.559 ns (3 allocations: 240 bytes)
  105.459 ns (6 allocations: 528 bytes)

I have a couple questions:

why is MutRet faster than Ret?
why is the number of allocations listed for MutRet the same as for Mut!, even though MutRet needs to create the [additional] return value vec(reduce(+,x,dims=1))?

—edited original for several errors as pointed out in comments—

mcabbott · August 31, 2020, 8:18pm

Note that this makes a new array, and then copies its contents into foo. You might be looking for map!. Or for sum(abs2, foo), if you don’t need the array – your functions return different things. And finally, you are timing things on really tiny arrays; is your real problem this size?

mkarikom · August 31, 2020, 8:52pm

Thanks @mcabbott, I’m updating the examples to fix the issue of different calculations and increase data size (my actual use case is indeed very large).
Looks like the results are now more in line with expected:

functions that don’t allocate as much are faster

It is still odd that MutRet() is faster than Mut!() despite the fact that it has to allocate another N words for the output bar.

ChrisRackauckas · August 31, 2020, 9:10pm

You’re still allocating the result of the reduction though. Loop through views of columns and sum the results to scalars and you’ll see that work out better.

mkarikom · September 1, 2020, 7:15pm

Thanks, Chris. Somehow my implementation of summed views is doing extra allocations (please see below). Do you have any suggestions?

using Test, BenchmarkTools

"""
    Mut!(x, y)

Double each element of `x` in place and copy the column sums of the result
into `y`. Returns `y`.
"""
function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    map!(x, x) do z
        z * 2
    end
    # `reduce` over dims=1 produces a 1×n matrix, flattened by `vec`.
    rowvec = reduce(+, x; dims=1)
    y .= vec(rowvec)
    return y
end
"""
    MutRet(x)

Double each element of `x` in place and return the column sums of the result
as a vector.
"""
function MutRet(x::Array{Int64,2})
    map!(x, x) do z
        z * 2
    end
    # `reduce` over dims=1 produces a 1×n matrix, flattened by `vec`.
    rowvec = reduce(+, x; dims=1)
    return vec(rowvec)
end
"""
    MutView!(x, y)

Double each element of `x` in place, sum every column through a view, and copy
those sums into `y`. Returns `y`.
"""
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
    map!(z -> z * 2, x, x)
    ncols = size(x, 2)
    # The comprehension collects the per-column sums into a temporary vector
    # before they are copied into y.
    colsums = [sum(@view x[:, j]) for j in 1:ncols]
    y .= colsums
    return y
end
"""
    MutRetView(x)

Double each element of `x` in place and return a vector holding the sum of
every column, computed through per-column views.
"""
function MutRetView(x::Array{Int64,2})
    map!(z -> z * 2, x, x)
    ncols = size(x, 2)
    colsums = [sum(@view x[:, j]) for j in 1:ncols]
    return colsums
end

tests

# Each variant starts from a 2×10 matrix of fives; after doubling, every entry
# must be 10 and every column sum must be 20.
# The testset name uses plain ASCII double quotes: typographic quotes are not
# string delimiters in Julia and make the block fail to parse.
@testset "mutate and return" begin
    # reduce-based, writes into the preallocated y
    x = Int64.(fill(5,2,10))
    y = fill(5,10)
    Mut!(x,y)
    @test all(x .== 10)
    @test all(y .== 20)

    # reduce-based, returns the sums
    x = fill(5,2,10)
    y = MutRet(x)
    @test all(x .== 10)
    @test all(y .== 20)

    # view-based, writes into the preallocated y
    x = Int64.(fill(5,2,10))
    y = fill(5,10)
    MutView!(x,y)
    @test all(x .== 10)
    @test all(y .== 20)

    # view-based, returns the sums
    x = fill(5,2,10)
    y = MutRetView(x)
    @test all(x .== 10)
    @test all(y .== 20)
end

Test Summary: | Pass Total
mutate and return | 8 8
Test.DefaultTestSet("mutate and return", Any[], 8, false)

type-stability

# Inspect the inferred types of each variant on a fresh 2×10 input.
x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype Mut!(x,y)

x = fill(5,2,10)
@code_warntype MutRet(x)

x = Int64.(fill(5,2,10))
y = fill(5,10)
@code_warntype MutView!(x,y)

x = fill(5,2,10)
# NOTE(review): this inspects MutRet a second time; MutRetView was presumably
# intended here — confirm before relying on the output that follows.
@code_warntype MutRet(x)

Body::Array{Int64,1}
1 ─ (#114 = %new(Main.:(var"#114#115")))
│ %2 = #114::Core.Compiler.Const(var"#114#115"(), false)
│ Main.map!(%2, x, x)
│ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
│ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
│ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
│ %10 = Main.vec(%9)::Array{Int64,1}
│ %11 = Base.broadcasted(Base.identity, %10)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
│ %12 = Base.materialize!(y, %11)::Array{Int64,1}
└── return %12
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 ─ (#116 = %new(Main.:(var"#116#117")))
│ %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
│ Main.map!(%2, x, x)
│ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
│ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
│ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
│ %10 = Main.vec(%9)::Array{Int64,1}
└── return %10
Variables
#self#::Core.Compiler.Const(MutView!, false)
x::Array{Int64,2}
y::Array{Int64,1}
#118::var"#118#120"
#119::var"#119#121"{Array{Int64,2}}

Body::Array{Int64,1}
1 ─ (#118 = %new(Main.:(var"#118#120")))
│ %2 = #118::Core.Compiler.Const(var"#118#120"(), false)
│ Main.map!(%2, x, x)
│ %4 = Main.:(var"#119#121")::Core.Compiler.Const(var"#119#121", false)
│ %5 = Core.typeof(x)::Core.Compiler.Const(Array{Int64,2}, false)
│ %6 = Core.apply_type(%4, %5)::Core.Compiler.Const(var"#119#121"{Array{Int64,2}}, false)
│ (#119 = %new(%6, x))
│ %8 = #119::var"#119#121"{Array{Int64,2}}
│ %9 = Main.size(x, 2)::Int64
│ %10 = (1:%9)::Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])
│ %11 = Base.Generator(%8, %10)::Core.Compiler.PartialStruct(Base.Generator{UnitRange{Int64},var"#119#121"{Array{Int64,2}}}, Any[var"#119#121"{Array{Int64,2}}, Core.Compiler.PartialStruct(UnitRange{Int64}, Any[Core.Compiler.Const(1, false), Int64])])
│ %12 = Base.collect(%11)::Array{Int64,1}
│ %13 = Base.broadcasted(Base.identity, %12)::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(identity),Tuple{Array{Int64,1}}}
│ %14 = Base.materialize!(y, %13)::Array{Int64,1}
└── return %14
Variables
#self#::Core.Compiler.Const(MutRet, false)
x::Array{Int64,2}
#116::var"#116#117"

Body::Array{Int64,1}
1 ─ (#116 = %new(Main.:(var"#116#117")))
│ %2 = #116::Core.Compiler.Const(var"#116#117"(), false)
│ Main.map!(%2, x, x)
│ %4 = (:dims,)::Core.Compiler.Const((:dims,), false)
│ %5 = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:dims,),T} where T<:Tuple, false)
│ %6 = Core.tuple(1)::Core.Compiler.Const((1,), false)
│ %7 = (%5)(%6)::NamedTuple{(:dims,),Tuple{Int64}}
│ %8 = Core.kwfunc(Main.reduce)::Core.Compiler.Const(Base.var"#reduce##kw"(), false)
│ %9 = (%8)(%7, Main.reduce, Main.:+, x)::Array{Int64,2}
│ %10 = Main.vec(%9)::Array{Int64,1}
└── return %10

benchmarks

# Time each variant on a fresh 2×10 input. All four functions mutate x, so
# every benchmark evaluation doubles the matrix again; only timings matter here.
# NOTE(review): x and y are non-const globals; BenchmarkTools usage normally
# interpolates them (`@btime Mut!($x, $y)`) — confirm whether that was intended.
x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime Mut!(x,y);

x = fill(5,2,10)
@btime MutRet(x);

x = Int64.(fill(5,2,10))
y = fill(5,10)
@btime MutView!(x,y);

x = fill(5,2,10)
@btime MutRetView(x);

82.080 ns (3 allocations: 240 bytes)
74.842 ns (3 allocations: 240 bytes)
107.862 ns (13 allocations: 688 bytes)
99.907 ns (13 allocations: 688 bytes)

ChrisRackauckas · September 1, 2020, 7:16pm

Comprehensions will allocate. You want to loop.

mkarikom · September 1, 2020, 8:54pm

Perfect, thanks!

"""
    Mut!(x, y)

Double the entries of `x` in place and write the column sums of the doubled
matrix into `y`. Returns `y`.
"""
function Mut!(x::Array{Int64,2},y::Array{Int64,1})
    twice = z -> z * 2
    map!(twice, x, x)
    # The dims=1 reduction gives a 1×n matrix; `vec` reshapes it to a vector.
    sums = vec(reduce(+, x; dims=1))
    y .= sums
    return y
end
"""
    MutView!(x, y)

Double every entry of `x` in place, then store the sum of column `i` of `x` in
`y[i]`, reading each column through a view rather than a copied slice.
Returns `y`.

Throws a `DimensionMismatch` (before modifying anything) if `length(y)` differs
from the number of columns of `x`.
"""
function MutView!(x::Array{Int64,2},y::Array{Int64,1})
    size(x, 2) == length(y) || throw(DimensionMismatch(
        "y has length $(length(y)) but x has $(size(x, 2)) columns"))
    map!(z -> z * 2, x, x)
    for i in eachindex(y)
        # Indexed assignment is required here: writing `view(y, i) = ...` would
        # define a local function named `view` and leave `y` untouched.
        y[i] = sum(@view x[:, i])
    end
    return y
end

benchmark

# Compare the reduce-based and loop-based variants on a larger input (2×1024).
# NOTE(review): x and y are non-const globals; BenchmarkTools usage normally
# interpolates them (`@btime Mut!($x, $y)`) — confirm whether that was intended.
N = 2^10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

# Same comparison on the original small input (2×10).
N = 10
x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime Mut!(x,y);

x = Int64.(fill(5,2,N))
y = fill(5,N)
@btime MutView!(x,y);

2.142 μs (3 allocations: 8.20 KiB)
543.016 ns (0 allocations: 0 bytes)
85.892 ns (3 allocations: 240 bytes)
9.877 ns (0 allocations: 0 bytes)

mcabbott · September 1, 2020, 9:17pm

This syntax means function definition, not assignment. It’s fast because it doesn’t change y. You could write .=, or write y[i] = sum(abs2, @view x[:,i]). Or you could just write sum!(abs2, y', x).

Henrique_Becker · September 1, 2020, 10:09pm

You may loop over (sum(view(x,:,z)) for z in 1:size(x,2)) without allocations. There were allocations only because [] was used instead of (); using [] collects the generator into a Vector.

mkarikom · September 1, 2020, 10:29pm

Thanks @mcabbott , not sure why I left out the test block on that one…

Topic		Replies	Views
Way to have a function "mutate an immutable" without much performance loss General Usage	4	1186	December 20, 2016
Performance differences when using a mutable struct that contains an array Performance	5	741	April 24, 2018
Performance regression in 1.0.1 Performance	5	879	October 2, 2018
Speed of internal function General Usage	7	459	June 16, 2021
Very simple code, why would there be performance problems Performance question	12	948	December 21, 2020

Understanding relative performance of (returning vs. mutating vs. both) functions

Related topics