Hello all,
I have some old Julia code, e.g.,
written in 2016, which uses in-place math operations on custom data types implemented by proprietary in-place operator functions.
The task I always have to complete is basically always the assembly of a basis, i.e. a weighted sum
f(\mathbf{x}) = \sum\limits_{i=1}^N  w_i * \phi_i(\mathbf{x})  .
The weight w_i can be an arbitrary type, e.g. a Float64, a Vector{Float64}, a Matrix{Float64}, or something more complex like a Vector{Matrix{Float64}} or an abstract type T which has the operators +,-,^,* or their proprietary in-place counterparts implemented.
The basis function \phi always acts scalar-wise on the weights, and I would like to implement broadcasting in order to use .+, .-, .^, .* but I am not quite sure how to do it the most efficient way.
I assembled an example:
#import LinearAlgebra: mul!
#import AltInplaceOpsInterface: add!
function Base.fill!(a::Matrix{Float64}, b::Matrix{Float64})
	@inbounds @simd for i in 1:length(a)
		 a[i] = b[i]
	end
	return nothing	
end
function add!(a::Matrix{Float64}, b::Matrix{Float64})
	@inbounds @simd for i in 1:length(a)
		 a[i] += b[i]
	end
	return nothing	
end
function add!(a::Matrix{Float64}, b::Float64)
	@inbounds @simd for i in 1:length(a)
		 a[i] += b
	end
	return nothing	
end
function mul!(a::Matrix{Float64}, b::Float64)
	@inbounds @simd for i in 1:length(a)
		 a[i] += b
	end
	return nothing		
end
Base.fill!(a::Vector{Matrix{Float64}}, b::Float64) = foreach(x->fill!(x,b), a)
function Base.fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
	@inbounds for i = 1:length(a); fill!(a[i],b[i]) end 
	return nothing	
end
function add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
	@inbounds for i = 1:length(a); add!(a[i],b[i]) end 
	return nothing	
end
function add!(a::Vector{Matrix{Float64}}, b::Float64)
	@inbounds for i = 1:length(a); add!(a[i],b) end 
	return nothing	
end
function mul!(a::Vector{Matrix{Float64}}, b::Float64)
	@inbounds for i = 1:length(a); mul!(a[i],b) end 
	return nothing		
end
# test weighted sum with broadcasting
function test_wsum!(ret::T, val::T, ntimes::Int) where T
	fill!(ret,0.0);
	for i in 1:ntimes
		ret .+= val.*rand()
	end
	return nothing
end
# test weighted sum with broadcasting and a temp array
function test_wsum!(ret::T, val::T, tmp::T, ntimes::Int) where T
	fill!(ret,0.0);
	for i in 1:ntimes
		tmp .= val
		tmp .*= rand()
		ret .+= tmp
	end
	return nothing
end
# test weighted sum with broadcasting and an additional loop
function test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int) where T
	fill!(rets,0.0);
	_n = length(rets)
	for i in 1:ntimes
		@inbounds for i in 1:_n
			rets[i] .+= vals[i].*rand()
		end
	end
	return nothing
end
# test weighted sum with inplace operations (temp array is needed)
function test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int) where T
	fill!(ret,0.0);
	for i in 1:ntimes
		fill!(tmp, val)
		mul!(tmp, rand())
		add!(ret, tmp)
	end
	return nothing
end
n = 20
m = 100
N = 30
ntimes = 20_000
val = rand(Float64,n,m)
tmp = zeros(Float64,n,m)
ret = zeros(Float64,n,m)
vals = map(x->rand(Float64,n,m), 1:N)
tmps = map(x->zeros(Float64,n,m), 1:N)
rets = map(x->zeros(Float64,n,m), 1:N)
# test with matrices
@time test_wsum!(ret, val, ntimes)
# 0.010501 seconds
@time test_wsum!(ret, val, tmp, ntimes)
# 0.036385 seconds
@time test_wsum_inplace_ops!(ret, val, tmp, ntimes)
# 0.010074 seconds
# test with vector of matrices
@time test_wsum!(rets, vals, ntimes)
# 1.609179 seconds (1.20 M allocations: 18.024 GiB, 5.03% gc time)
@time test_wsum!(rets, vals, tmps, ntimes)
# 1.617881 seconds (1.20 M allocations: 18.024 GiB, 5.87% gc time)
@time test_wsum_vec!(rets, vals, ntimes)
#  0.382193 seconds
@time test_wsum_inplace_ops!(rets, vals, tmps, ntimes)
# 0.696167 seconds
I basically have two functions:
test_wsum!(ret::T, val::T, ntimes::Int) where I use broadcasting
and
test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int)  where I use the in-place functions
For Matrix{Float64} the broadcast version and the in-place operations version are equally fast.
For Vector{Matrix{Float64}} only the in-place operations version is fast, while the broadcast version allocates memory.
Only if I put a loop around the broadcast version as in
test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int) the broadcast version is fast (it is actually the fastest).
My question would be:
Can I maintain the syntax of test_wsum!(ret::T, val::T, ntimes::Int) (without the additional loop) and be as fast or even faster as test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)?
That way all data types could remain using the same lines of code and I do not have to specialize or something like that.
Thank you in advance.
Greetz max