Hello all,
I have some old Julia code, e.g.,
written in 2016, which uses in-place math operations on custom data types implemented by proprietary in-place operator functions.
The task I repeatedly have to complete is the assembly of a basis, i.e. a weighted sum
f(\mathbf{x}) = \sum_{i=1}^{N} w_i \, \phi_i(\mathbf{x}) .
The weight w_i can be an arbitrary type, e.g. a Float64
, a Vector{Float64}
, a Matrix{Float64}
, or something more complex like a Vector{Matrix{Float64}}
or an abstract type T which has the operators +,-,^,*
or their proprietary in-place counterparts implemented.
The basis function \phi always acts scalar-wise on the weights, and I would like to implement broadcasting in order to use .+, .-, .^, .*
but I am not quite sure how to do it the most efficient way.
I assembled an example:
#import LinearAlgebra: mul!
#import AltInplaceOpsInterface: add!
"""
    fill!(a::Matrix{Float64}, b::Matrix{Float64})

Copy `b` element-wise into the preallocated matrix `a` without allocating.
Semantically this is `copyto!`; the `fill!` name is kept so existing call
sites keep working.

NOTE(review): adding a method to `Base.fill!` for types we do not own is
type piracy — consider moving this to a package-local function name.
"""
function Base.fill!(a::Matrix{Float64}, b::Matrix{Float64})
    # `eachindex(a, b)` checks once that both matrices share the same
    # indices, which makes the `@inbounds` annotation provably safe
    # (the old `1:length(a)` form silently invoked undefined behavior
    # when `b` was smaller than `a`).
    @inbounds @simd for i in eachindex(a, b)
        a[i] = b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Matrix{Float64})

In-place element-wise accumulation `a .+= b` without allocating.
"""
function add!(a::Matrix{Float64}, b::Matrix{Float64})
    # `eachindex(a, b)` validates matching indices once, so `@inbounds`
    # is safe; the old `1:length(a)` form was UB for mismatched sizes.
    @inbounds @simd for i in eachindex(a, b)
        a[i] += b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Float64)

Add the scalar `b` to every entry of `a` in place.
"""
function add!(a::Matrix{Float64}, b::Float64)
    # `eachindex` (instead of `1:length(a)`) makes `@inbounds` provably safe.
    @inbounds @simd for i in eachindex(a)
        a[i] += b
    end
    return nothing
end
"""
    mul!(a::Matrix{Float64}, b::Float64)

Scale every entry of `a` by the scalar `b` in place.
"""
function mul!(a::Matrix{Float64}, b::Float64)
    @inbounds @simd for i in eachindex(a)
        # BUG FIX: the original body did `a[i] += b`, i.e. it ADDED the
        # scalar instead of multiplying by it, so the in-place benchmark
        # computed a different result than the broadcast versions.
        a[i] *= b
    end
    return nothing
end
Base.fill!(a::Vector{Matrix{Float64}}, b::Float64) = foreach(x->fill!(x,b), a)
"""
    fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})

Copy each matrix `b[i]` into the preallocated matrix `a[i]` (delegates to
the matrix/matrix `fill!` method; no allocation).
"""
function Base.fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    # `eachindex(a, b)` verifies both vectors share the same indices,
    # replacing the unsafe `@inbounds for i = 1:length(a)` pattern.
    @inbounds for i in eachindex(a, b)
        fill!(a[i], b[i])
    end
    return nothing
end
"""
    add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})

Accumulate `b[i]` into `a[i]` for every element, via the matrix/matrix
`add!` method; no allocation.
"""
function add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    # `eachindex(a, b)` checks matching indices once, making `@inbounds` safe.
    @inbounds for i in eachindex(a, b)
        add!(a[i], b[i])
    end
    return nothing
end
"""
    add!(a::Vector{Matrix{Float64}}, b::Float64)

Add the scalar `b` to every entry of every inner matrix of `a` in place.
"""
function add!(a::Vector{Matrix{Float64}}, b::Float64)
    # `eachindex` replaces the `1:length(a)` anti-pattern.
    @inbounds for i in eachindex(a)
        add!(a[i], b)
    end
    return nothing
end
"""
    mul!(a::Vector{Matrix{Float64}}, b::Float64)

Scale every inner matrix of `a` by the scalar `b` in place, via the
matrix/scalar `mul!` method.
"""
function mul!(a::Vector{Matrix{Float64}}, b::Float64)
    # `eachindex` replaces the `1:length(a)` anti-pattern.
    @inbounds for i in eachindex(a)
        mul!(a[i], b)
    end
    return nothing
end
# test weighted sum with broadcasting
# Weighted-sum benchmark using fused broadcasting only:
# accumulates `ntimes` randomly weighted copies of `val` into `ret`.
# One `rand()` weight is drawn per iteration; `ret` is zeroed first.
function test_wsum!(ret::T, val::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        w = rand()
        ret .+= w .* val
    end
    return nothing
end
# test weighted sum with broadcasting and a temp array
# Weighted-sum benchmark using broadcasting through an explicit scratch
# buffer `tmp`: each iteration writes `val * w` into `tmp`, then
# accumulates `tmp` into `ret`. `ret` is zeroed first.
function test_wsum!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        w = rand()
        @. tmp = val * w
        ret .+= tmp
    end
    return nothing
end
# test weighted sum with broadcasting and an additional loop
"""
    test_wsum_vec!(rets, vals, ntimes)

Weighted-sum benchmark with an explicit outer loop: broadcasting is applied
per element of the outer vector, so each `rets[j] .+= w .* vals[j]` fuses on
a plain container and avoids the per-step temporaries that broadcasting over
the whole `Vector{T}` would allocate.
"""
function test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int) where T
    fill!(rets, 0.0)
    for _ in 1:ntimes
        # FIX: the inner loop reused the outer loop's variable name `i`
        # (shadowing); renamed for clarity. `eachindex(rets, vals)` also
        # checks the two vectors agree, making `@inbounds` safe.
        @inbounds for j in eachindex(rets, vals)
            rets[j] .+= vals[j] .* rand()
        end
    end
    return nothing
end
# test weighted sum with inplace operations (temp array is needed)
# Weighted-sum benchmark built purely on the proprietary in-place
# operators; the scratch buffer `tmp` is required because `mul!`
# scales its first argument in place.
function test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        fill!(tmp, val)    # tmp <- val
        mul!(tmp, rand())  # tmp <- tmp scaled by a fresh random weight
        add!(ret, tmp)     # ret <- ret + tmp
    end
    return nothing
end
# --- Benchmark setup --------------------------------------------------------
# n×m matrices; N matrices per outer vector; ntimes summation steps per run.
n = 20
m = 100
N = 30
ntimes = 20_000
# Single-matrix operands: weight source, scratch buffer, result.
val = rand(Float64,n,m)
tmp = zeros(Float64,n,m)
ret = zeros(Float64,n,m)
# Vector-of-matrices operands: N independent n×m matrices each.
vals = map(x->rand(Float64,n,m), 1:N)
tmps = map(x->zeros(Float64,n,m), 1:N)
rets = map(x->zeros(Float64,n,m), 1:N)
# NOTE(review): @time includes compilation on the first call of each method;
# BenchmarkTools.@btime with $-interpolated arguments gives sturdier numbers.
# test with matrices
@time test_wsum!(ret, val, ntimes)
# 0.010501 seconds
@time test_wsum!(ret, val, tmp, ntimes)
# 0.036385 seconds
@time test_wsum_inplace_ops!(ret, val, tmp, ntimes)
# 0.010074 seconds
# test with vector of matrices
@time test_wsum!(rets, vals, ntimes)
# 1.609179 seconds (1.20 M allocations: 18.024 GiB, 5.03% gc time)
# ^ broadcasting over the outer Vector allocates a temporary Matrix per
#   element per step, which explains the GiB-scale allocation figure.
@time test_wsum!(rets, vals, tmps, ntimes)
# 1.617881 seconds (1.20 M allocations: 18.024 GiB, 5.87% gc time)
@time test_wsum_vec!(rets, vals, ntimes)
# 0.382193 seconds
@time test_wsum_inplace_ops!(rets, vals, tmps, ntimes)
# 0.696167 seconds
I basically have two functions:
test_wsum!(ret::T, val::T, ntimes::Int)
where I use broadcasting
and
test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int)
where I use the in-place functions
For Matrix{Float64}
the broadcast version and the in-place operations version are equally fast.
For Vector{Matrix{Float64}}
only the in-place operations version is fast, while the broadcast version allocates memory.
Only if I put a loop around the broadcast version as in
test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)
the broadcast version is fast (it is actually the fastest).
My question would be:
Can I maintain the syntax of test_wsum!(ret::T, val::T, ntimes::Int)
(without the additional loop) and be as fast as, or even faster than, test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)
?
That way all data types could remain using the same lines of code and I do not have to specialize or something like that.
Thank you in advance.
Greetz max