Julian way to write this code

Is there a pretty way to write this that also has good performance? g1 produces runtime dispatch, g2 produces an allocation, and g3 doesn’t look very “Julian” and is also unsafe.

function g1(x, y, w)
    sum(i -> x[i] * y[i] * w[i], 1:length(x))
end
function g2(x, y, w)
    sum(x .* y .* w)
end
function g3(x::AbstractArray{T}, y::AbstractArray{T}, w::AbstractArray{T}) where T
    l = length(x)
    r = T(0)
    @inbounds for i in 1:l
        r += x[i] * y[i] * w[i]
    end
    return r
end
function g4(x, y, w)
    x .*= y
    x .*= w
    sum(x)
end
x = rand(3)
y = rand(3)
w = rand(3)
julia> @benchmark g1($x, $y, $w)
BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range (min … max):   9.969 ns … 46.624 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     12.405 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   12.423 ns ±  0.565 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                           ▃▁ ▂ ▁  ▄▇█▇                       ▂
  ▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██▁█▁█▄▆████▅▆▇▅▆▄▄▁▃▄▃▄▆▅▅▅▅▅▆▆▆▇ █
  9.97 ns      Histogram: log(frequency) by time      14.1 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark g2($x, $y, $w)
BenchmarkTools.Trial: 10000 samples with 991 evaluations.
 Range (min … max):  38.842 ns … 115.352 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     49.113 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   48.779 ns ±   4.394 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                       ▁▅█▇▃                                    
  ▁▁▁▂▂▂▂▂▃▃▂▂▂▂▂▃▃▃▃▄▆█████▇▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  38.8 ns         Histogram: frequency by time         65.6 ns <

 Memory estimate: 80 bytes, allocs estimate: 1.

julia> @benchmark g3($x, $y, $w)
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.135 ns … 22.953 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.367 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.433 ns ±  0.426 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

              ▂█                                              
  ▅▃▁▃▁▁▃▅▄▅▅▆███▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▂▃▃▂▃ ▃
  3.14 ns        Histogram: frequency by time        4.25 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark g4($x, $y, $w)
BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range (min … max):  8.153 ns … 35.191 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     8.876 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   8.961 ns ±  0.570 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▄    ▁        ▅▆▇▇█▅▄▃▁ ▁▁ ▁▂▃▃▂▂▃ ▁▁▁ ▁                   ▂
  █▄▁▁▇█▄▆▁▄█▇▄▁███████████████████████████▆▆▅▅▅▅▅▄▅▅▄▄▅▄▅▆▆ █
  8.15 ns      Histogram: log(frequency) by time     10.5 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

I think g3 comes closest, but you can make it safer by removing @inbounds and using eachindex instead:

function g5(x, y, w)
    T = promote_type(eltype(x), eltype(y), eltype(w))
    r = zero(T)
    for i in eachindex(x, y, w)
        r += x[i] * y[i] * w[i]
    end
    return r
end

EDIT: weirdly enough, this one is rather slow too; unsure why.


Didn’t know that eachindex could take more than one array.
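
For reference, a quick REPL sketch of the multi-array form (error message abbreviated):

julia> eachindex(rand(3), rand(3))
Base.OneTo(3)

julia> eachindex(rand(3), rand(4))
ERROR: DimensionMismatch: all inputs to eachindex must have the same indices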

It isn’t too bad:

julia> @benchmark g5($x1, $y1, $w1)
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  5.240 ns … 39.784 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     5.290 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   5.314 ns ±  0.444 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

       ▅█▂                                                    
  ▂▄▃▆▃███▅▂▄▂▃▂▁▂▂▂▂▁▂▂▂▁▁▂▂▂▁▁▂▂▂▂▂▂▁▂▁▁▂▂▂▁▂▂▂▂▁▂▂▁▂▁▁▁▂▂ ▂
  5.24 ns        Histogram: frequency by time         5.7 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Also be careful not to benchmark g4 that way, because it will modify your x to store very small numbers, and then subsequent benchmarks will be slower (presumably because of subnormal numbers appearing). I recommend you use a setup phase instead; see Manual · BenchmarkTools.jl
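
Something like this (a sketch of that setup pattern; evals=1 makes sure each freshly copied x is mutated only once):

@benchmark g4(x, $y, $w) setup=(x = copy($x)) evals=1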


You can also add @simd to your g5 to improve it further.

julia> function g5(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           for i in eachindex(x, y, w)
               r += x[i] * y[i] * w[i]
           end
           return r
       end
g5 (generic function with 1 method)

julia> function g6(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           @simd for i in eachindex(x, y, w)
               r += x[i] * y[i] * w[i]
           end
           return r
       end
g6 (generic function with 1 method)

julia> @benchmark g5(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)))
BenchmarkTools.Trial: 10000 samples with 993 evaluations.
 Range (min … max):  37.865 ns … 268.983 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     38.469 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   40.368 ns ±   5.999 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▃█▄ ▂       ▁▅▅                                              ▁
  ██████▇██▇▇▇██████▇█▇▇▇▆▇▇▇▆▆▆▆▅▆▆▅▆▅▄▅▅▃▄▆▅▄▄▅▆▄▃▄▅▅▄▅▄▅▄▆▅ █
  37.9 ns       Histogram: log(frequency) by time      56.7 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark g6(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)))
BenchmarkTools.Trial: 10000 samples with 999 evaluations.
 Range (min … max):  12.112 ns … 137.337 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     12.412 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   13.260 ns ±   2.603 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▃   ▄ ▂▇▁    ▁▃▄▂▁                                          ▂
  ██▇▃▅█▇█████▇▅█████▆▇██▆▆▇▆▆▇▇▁▇▅▅▆▅▅▅▇▅▄▄▄▁▃▆▄▅▄▅▁▃▃▁▄▃▃▃▁▄ █
  12.1 ns       Histogram: log(frequency) by time      22.3 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Note the caveat that @simd can produce slightly different results because floating point addition is not associative, and it’s allowed to reorder the inputs:

x = rand(1000)
y = rand(1000)
w = rand(1000)

g5(x, y, w)
> 136.6347749999703

g6(x, y, w)
> 136.6347749999705

Is that so? Neither @code_warntype nor @descent (Cthulhu.jl) seem to indicate that for me. Can you share how to test that?

The performance difference might simply be due to the fact that sum does not do exactly the same thing as g3: as far as I know, it uses a more accurate pairwise summation algorithm for (large) sequences of numbers (please correct me if I’m wrong).

For small vectors like in your benchmarks, there might be no difference, but for longer ones you can check that the results are slightly different. Also @simd can change the output (ah, @Satvik beat me to it :grin: ).
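
A rough way to see that on longer inputs (a sketch; the BigFloat sum serves as the accuracy reference):

xs = rand(10^7)
ref = sum(big.(xs))                       # high-precision reference value
err_loop = Float64(foldl(+, xs) - ref)    # straight left-to-right accumulation, like g3
err_sum  = Float64(sum(xs) - ref)         # Base.sum's pairwise summation
# |err_loop| is typically orders of magnitude larger than |err_sum|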

Whether you need the extra accuracy or speed depends on your use case of course. If the vectors stay small, maybe static vectors are another option? Not sure if they will improve the summation performance though, just throwing it out there.
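
A minimal sketch of the static-vector option (assuming the g5 from above; SVector operations stay on the stack):

using StaticArrays
xs, ys, ws = @SVector(rand(3)), @SVector(rand(3)), @SVector(rand(3))
g5(xs, ys, ws)       # the generic loop works unchanged on SVectors
sum(xs .* ys .* ws)  # broadcasting SVectors yields another SVector, no heap allocation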


Imho, this is nice and quite fast:

g7(x, y, z) = sum(splat(*), zip(x, y, z))
julia> @btime g5(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)));

  210.963 ns (0 allocations: 0 bytes)

julia> @btime g7(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)));

  223.047 ns (0 allocations: 0 bytes)

One more option with LazyArrays (with the caveat of @simd)

using LazyArrays
g8(x, y, w) = sum(@~ x .* y .* w)

@btime g5(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)))
@btime g8(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)))

with times

#g5  32.425 ns (0 allocations: 0 bytes)
#g8  17.512 ns (0 allocations: 0 bytes)

This is good to keep in mind, but at a point in the investigation where the alternatives involve calling sum anyway, the @simd reordering is a minor effect, and more likely than not it is numerically beneficial compared to straight left-to-right accumulation.


This is what I get for g1. @code_warntype says it is fine; I haven’t tested @descent.

julia> @report_opt g1(x1, y1, w1)
═════ 1 possible error found ═════
┌ g1(x::Vector{Float64}, y::Vector{Float64}, w::Vector{Float64}) @ Main ./REPL[32]:2
│┌ sum(f::var"#5#6"{Vector{Float64}, Vector{Float64}, Vector{Float64}}, a::UnitRange{Int64}) @ Base ./reducedim.jl:1011                                                                                                  
││┌ sum(f::var"#5#6"{Vector{Float64}, Vector{Float64}, Vector{Float64}}, a::UnitRange{Int64}; dims::Colon, kw::@Kwargs{}) @ Base ./reducedim.jl:1011
│││┌ _sum(f::var"#5#6"{Vector{Float64}, Vector{Float64}, Vector{Float64}}, a::UnitRange{Int64}, ::Colon) @ Base ./reducedim.jl:1015                                                                                                  
││││┌ _sum(f::var"#5#6"{Vector{Float64}, Vector{Float64}, Vector{Float64}}, a::UnitRange{Int64}, ::Colon; kw::@Kwargs{}) @ Base ./reducedim.jl:1015                                                                                        
│││││┌ mapreduce(f::var"#5#6"{Vector{…}, Vector{…}, Vector{…}}, op::typeof(Base.add_sum), A::UnitRange{Int64}) @ Base ./reducedim.jl:357                                                                                                   
││││││┌ mapreduce(f::var"#5#6"{…}, op::typeof(Base.add_sum), A::UnitRange{…}; dims::Colon, init::Base._InitialValue) @ Base ./reducedim.jl:357                                                                                             
│││││││┌ _mapreduce_dim(f::var"#5#6"{…}, op::typeof(Base.add_sum), ::Base._InitialValue, A::UnitRange{…}, ::Colon) @ Base ./reducedim.jl:365                                                                                               
││││││││┌ _mapreduce(f::var"#5#6"{Vector{…}, Vector{…}, Vector{…}}, op::typeof(Base.add_sum), ::IndexLinear, A::UnitRange{Int64}) @ Base ./reduce.jl:432                                                                              
│││││││││┌ mapreduce_empty_iter(f::var"#5#6"{…}, op::typeof(Base.add_sum), itr::UnitRange{…}, ItrEltype::Base.HasEltype) @ Base ./reduce.jl:380                                                                                
││││││││││┌ reduce_empty_iter(op::Base.MappingRF{var"#5#6"{…}, typeof(Base.add_sum)}, itr::UnitRange{Int64}, ::Base.HasEltype) @ Base ./reduce.jl:384                                                                                
│││││││││││┌ reduce_empty(op::Base.MappingRF{var"#5#6"{Vector{…}, Vector{…}, Vector{…}}, typeof(Base.add_sum)}, ::Type{Int64}) @ Base ./reduce.jl:361
││││││││││││ runtime dispatch detected: Base.mapreduce_empty(%1::var"#5#6"{Vector{Float64}, Vector{Float64}, Vector{Float64}}, add_sum, ::Int64)                                     
│││││││││││└────────────────────

The vectors are quite short, so I don’t think I will benefit from @simd.

SIMD instructions operate on very small packs of data (2, 4, or 8 elements) and can be beneficial even for small vectors (there is no overhead).
For this operation, if the data is not already in cache, the major cost will be reading it from RAM. That may be non-trivial to evaluate, because the benchmark tends to create an artificial temporal locality that may or may not represent your actual computation.
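
A quick way to check that on 3-element inputs (a sketch, reusing the g5 and g6 from above, with BenchmarkTools loaded):

@btime g5($(rand(3)), $(rand(3)), $(rand(3)))
@btime g6($(rand(3)), $(rand(3)), $(rand(3)))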


AFAIK, you should keep the @inbounds there, which is then safe because it is guaranteed. As of today, eachindex used that way is not exploited by the compiler to avoid bounds checking.

That’s incorrect (at least on my Julia 1.10.5). Generally, using eachindex signals to the compiler that the index accesses are in fact in bounds. This is the great advantage of using it in the first place :slight_smile: You get a safe @inbounds from it!

I checked this example, and adding @inbounds gives the exact same LLVM code.
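
For anyone who wants to verify this on their own version, a sketch of the check:

@code_llvm debuginfo=:none g5(x, y, w)   # dump the optimized IR for the eachindex loop
# repeat for a copy of g5 with @inbounds on the accumulation line and
# diff the two dumps; per the above, they match on Julia 1.10.5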


Here is yet another suggestion along the same lines as g1 and g7:

julia> g9(x,y,z) = sum(x[i] * y[i] * z[i] for i in eachindex(x, y, z))
g9 (generic function with 1 method)

julia> x, y, z = rand(3), rand(3), rand(3);

julia> @btime g9($x, $y, $z)
  4.472 ns (0 allocations: 0 bytes)
0.7977237806641253

Good to know; the last time I checked, that was not the case (it isn’t, for instance, in Julia 1.6):

julia> using BenchmarkTools

julia> function g6(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           for i in eachindex(x, y, w)
               r += x[i] * y[i] * w[i]
           end
           return r
       end
g6 (generic function with 1 method)

julia> @btime g6($(rand(1000)),$(rand(1000)),$(rand(1000)))
  518.948 ns (0 allocations: 0 bytes)
116.36617338389264

julia> function g6(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           for i in eachindex(x, y, w)
               @inbounds r += x[i] * y[i] * w[i]
           end
           return r
       end
g6 (generic function with 1 method)

julia> @btime g6($(rand(1000)),$(rand(1000)),$(rand(1000)))
  416.673 ns (0 allocations: 0 bytes)
120.4890370653176

PS: nobody mentioned, I think, that LoopVectorization.@turbo is still around:

julia> using LoopVectorization

julia> x, y, z = (rand(10^3), rand(10^3), rand(10^3));

julia> function g6(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           @turbo for i in eachindex(x, y, w)
               r += x[i] * y[i] * w[i]
           end
           return r
       end
g6 (generic function with 1 method)

julia> @btime g6($x,$y,$z)
  65.375 ns (0 allocations: 0 bytes)
117.85528649756924

julia> function g6(x, y, w)
           T = promote_type(eltype(x), eltype(y), eltype(w))
           r = zero(T)
           @simd for i in eachindex(x, y, w)
               r += x[i] * y[i] * w[i]
           end
           return r
       end
g6 (generic function with 1 method)

julia> @btime g6($x,$y,$z)
  79.421 ns (0 allocations: 0 bytes)
117.85528649756922

But the OP should probably also clarify whether the goal is to actually perform these operations on those short (3-element) arrays.

The length 3 seems odd here. If you don’t need anything longer than 3, StaticArrays.jl yields a comparable (if not marginally faster) benefit to SIMD for me, and is generally preferred for vectors of this size. Is 3 a representative length for this problem, or should we benchmark with a larger vector?

julia> using Chairmarks, StaticArrays

julia> function g6(x, y, w)
                  T = promote_type(eltype(x), eltype(y), eltype(w))
                  r = zero(T)
                  @simd for i in eachindex(x, y, w)
                      r += x[i] * y[i] * w[i]
                  end
                  return r
              end
g6 (generic function with 1 method)

julia> x = rand(3); y = rand(3); w = rand(3);

julia> xstatic = SVector{3}(x); ystatic = SVector{3}(y); wstatic = SVector{3}(w);

julia> g_mapreduce(x, y, w) = mapreduce(*, +, x, y, w)
g_mapreduce (generic function with 1 method)

julia> @b g6(x, y, w)
20.227 ns (1.01 allocs: 16.175 bytes)

julia> @b g6(xstatic, ystatic, wstatic)
18.295 ns (1.01 allocs: 16.135 bytes)

julia> @b g_mapreduce(x, y, w)
754.138 ns (6.28 allocs: 214.621 bytes)

julia> @b g_mapreduce(xstatic, ystatic, wstatic)
17.724 ns (1.01 allocs: 16.127 bytes)


Do you know why splat(*) behaves differently from prod here?
I tried it, and while prod triggers some allocations, it’s still faster for larger array sizes.

Sorry, I don’t know where the allocations are coming from … it seems that there are several special paths/fallbacks in prod, though:

julia> g8(x, y, z) = sum(prod, zip(x, y, z))
g8 (generic function with 1 method)

julia> @btime g8(x, y, z) (setup=(x = rand(100); y = rand(100); z = rand(100)));

  201.941 ns (3 allocations: 80 bytes)

julia> @btime g8(x, y, z) (setup=(x = rand(Int, 100); y = rand(Int, 100); z = rand(Int, 100)));

  124.463 ns (0 allocations: 0 bytes)
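
If someone wants to dig further: the elements of zip(x, y, z) are 3-tuples, so one starting point (a sketch) is to check which prod method those hit:

t = first(zip(x, y, z))   # a Tuple{Float64, Float64, Float64}
@which prod(t)            # shows the method that small tuples dispatch to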