Hello all,

I have some old Julia code, e.g.,

written in 2016, which uses in-place math operations on custom data types implemented by proprietary in-place operator functions.

The task I have to complete is basically always the assembly of a basis, i.e. a weighted sum

f(\mathbf{x}) = \sum_{i=1}^{N} w_i \, \phi_i(\mathbf{x}).

The weight w_i can be an arbitrary type, e.g. a `Float64`

, a `Vector{Float64}`

, a `Matrix{Float64}`

, or something more complex like a `Vector{Matrix{Float64}}`

or an abstract type T which has the operators `+,-,^,*`

or their proprietary in-place counterparts implemented.

The basis function \phi always acts scalar-wise on the weights, and I would like to implement broadcasting in order to use `.+, .-, .^, .*`

but I am not quite sure how to do it the most efficient way.

I assembled an example:

```
#import LinearAlgebra: mul!
#import AltInplaceOpsInterface: add!
# NOTE(review): this is type piracy — both `fill!` and `Matrix{Float64}` are
# owned by Base. Despite the name, this method *copies* `b` into `a`
# (semantically `copyto!`); the signature is kept so existing callers work.
"""
    fill!(a::Matrix{Float64}, b::Matrix{Float64})

Copy the elements of `b` into `a` element-wise, in place. Returns `nothing`.
Throws `DimensionMismatch` if the two matrices do not share indices.
"""
function Base.fill!(a::Matrix{Float64}, b::Matrix{Float64})
    # eachindex(a, b) proves the indices valid for both arrays, making
    # @inbounds safe (the old `1:length(a)` could silently read past `b`).
    @inbounds @simd for i in eachindex(a, b)
        a[i] = b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Matrix{Float64})

In-place element-wise accumulation `a .+= b`. Returns `nothing`.
Throws `DimensionMismatch` if the two matrices do not share indices.
"""
function add!(a::Matrix{Float64}, b::Matrix{Float64})
    # eachindex(a, b) validates both arrays up front, so @inbounds is safe
    # (the old `1:length(a)` could read past the end of a shorter `b`).
    @inbounds @simd for i in eachindex(a, b)
        a[i] += b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Float64)

In-place scalar accumulation `a .+= b`. Returns `nothing`.
"""
function add!(a::Matrix{Float64}, b::Float64)
    # eachindex is the idiomatic (and @inbounds-safe) index set for `a`.
    @inbounds @simd for i in eachindex(a)
        a[i] += b
    end
    return nothing
end
"""
    mul!(a::Matrix{Float64}, b::Float64)

In-place scalar multiplication `a .*= b`. Returns `nothing`.
"""
function mul!(a::Matrix{Float64}, b::Float64)
    # BUG FIX: the original body did `a[i] += b` (copy-paste from `add!`),
    # so `mul!` added the scalar instead of multiplying by it. This made
    # every `test_wsum_inplace_ops!` result numerically wrong even though
    # the timings looked plausible.
    @inbounds @simd for i in eachindex(a)
        a[i] *= b
    end
    return nothing
end
# Set every entry of every matrix in `a` to the scalar `b`.
function Base.fill!(a::Vector{Matrix{Float64}}, b::Float64)
    for block in a
        fill!(block, b)
    end
end
# Block-wise copy: overwrite each matrix in `a` with the corresponding
# matrix in `b`, delegating to the two-matrix `fill!` method.
function Base.fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    foreach(fill!, a, b)
    return nothing
end
# Block-wise accumulation: `a[k] .+= b[k]` for each pair of matrices,
# delegating to the Matrix-level `add!`.
function add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    foreach(add!, a, b)
    return nothing
end
# Add the scalar `b` to every entry of every matrix in `a`, delegating to
# the Matrix-level `add!`.
function add!(a::Vector{Matrix{Float64}}, b::Float64)
    foreach(block -> add!(block, b), a)
    return nothing
end
# Scale every matrix in `a` by the scalar `b`, delegating to the
# Matrix-level `mul!`.
function mul!(a::Vector{Matrix{Float64}}, b::Float64)
    foreach(block -> mul!(block, b), a)
    return nothing
end
"""
    test_wsum!(ret, val, ntimes)

Weighted-sum benchmark using fused broadcasting: zero `ret`, then
accumulate `ntimes` copies of `val`, each scaled by a fresh random
weight, via `ret .+= val .* w`. Returns `nothing`.
"""
function test_wsum!(ret::T, val::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        w = rand()
        ret .+= val .* w
    end
    return nothing
end
"""
    test_wsum!(ret, val, tmp, ntimes)

Weighted-sum benchmark using broadcasting through an explicit scratch
buffer: each iteration writes `val` scaled by a fresh random weight into
`tmp`, then accumulates `tmp` into `ret`. `ret` is zeroed first.
Returns `nothing`.
"""
function test_wsum!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        tmp .= val .* rand()   # fused: same as tmp .= val; tmp .*= w
        ret .+= tmp
    end
    return nothing
end
"""
    test_wsum_vec!(rets, vals, ntimes)

Weighted-sum benchmark that puts an explicit loop around the broadcast:
for each outer repetition, `rets[k] .+= vals[k] .* rand()` for every `k`.
`rets` is zeroed first via `fill!(rets, 0.0)` (for `Vector{Matrix{Float64}}`
this dispatches to the custom scalar `fill!` method defined above).
Returns `nothing`.
"""
function test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int) where T
    fill!(rets, 0.0)
    # FIX: the inner loop originally reused the outer loop variable `i`,
    # shadowing it; renamed to `k`. eachindex(rets, vals) also checks the
    # two vectors agree, which makes @inbounds provably safe.
    idx = eachindex(rets, vals)
    for _ in 1:ntimes
        @inbounds for k in idx
            rets[k] .+= vals[k] .* rand()
        end
    end
    return nothing
end
"""
    test_wsum_inplace_ops!(ret, val, tmp, ntimes)

Weighted-sum benchmark built on the proprietary in-place operators: each
iteration copies `val` into the scratch buffer `tmp` (via the custom
two-argument `fill!`), scales `tmp` by a random weight with `mul!`, and
accumulates it into `ret` with `add!`. `ret` is zeroed first.
Returns `nothing`.
"""
function test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        fill!(tmp, val)    # tmp <- val (copy)
        mul!(tmp, rand())  # tmp <- tmp * w
        add!(ret, tmp)     # ret <- ret + tmp
    end
    return nothing
end
# ---- Benchmark driver ----------------------------------------------------
# Problem size: each weight is an n×m matrix; the vector-of-matrices case
# uses N of them. Every test accumulates `ntimes` random-weighted copies.
n = 20
m = 100
N = 30
ntimes = 20_000
# Single-matrix operands: value, scratch buffer, accumulator.
val = rand(Float64,n,m)
tmp = zeros(Float64,n,m)
ret = zeros(Float64,n,m)
# Vector-of-matrices operands: N independent n×m blocks each.
vals = map(x->rand(Float64,n,m), 1:N)
tmps = map(x->zeros(Float64,n,m), 1:N)
rets = map(x->zeros(Float64,n,m), 1:N)
# NOTE(review): a first `@time` call includes compilation; for stable numbers
# prefer `BenchmarkTools.@btime` with `$`-interpolated arguments.
# test with matrices
@time test_wsum!(ret, val, ntimes)
# 0.010501 seconds
@time test_wsum!(ret, val, tmp, ntimes)
# 0.036385 seconds
@time test_wsum_inplace_ops!(ret, val, tmp, ntimes)
# 0.010074 seconds
# test with vector of matrices
@time test_wsum!(rets, vals, ntimes)
# 1.609179 seconds (1.20 M allocations: 18.024 GiB, 5.03% gc time)
@time test_wsum!(rets, vals, tmps, ntimes)
# 1.617881 seconds (1.20 M allocations: 18.024 GiB, 5.87% gc time)
@time test_wsum_vec!(rets, vals, ntimes)
# 0.382193 seconds
@time test_wsum_inplace_ops!(rets, vals, tmps, ntimes)
# 0.696167 seconds
```

I basically have two functions:

`test_wsum!(ret::T, val::T, ntimes::Int)`

where I use broadcasting

and

`test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int) `

where I use the in-place functions

For `Matrix{Float64}`

the broadcast version and the in-place operations version are equally fast.

For `Vector{Matrix{Float64}}`

only the in-place operations version is fast, while the broadcast version allocates memory.

Only if I put a loop around the broadcast version as in

`test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)`

the broadcast version is fast (it is actually the fastest).

My question would be:

Can I maintain the syntax of `test_wsum!(ret::T, val::T, ntimes::Int)`

(without the additional loop) and be as fast as or even faster than `test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)`

?

That way all data types could remain using the same lines of code and I do not have to specialize or something like that.

Thank you in advance.

Greetz max