Hello all,
I have some old Julia code, e.g.,
written in 2016, which uses in-place math operations on custom data types implemented by proprietary in-place operator functions.
The task I repeatedly have to complete is the assembly of a basis, i.e. a weighted sum
f(\mathbf{x}) = \sum_{i=1}^{N} w_i \, \phi_i(\mathbf{x}) .
The weight w_i can be an arbitrary type, e.g. a Float64
, a Vector{Float64}
, a Matrix{Float64}
, or something more complex like a Vector{Matrix{Float64}}
or an abstract type T which has the operators +,-,^,*
or their proprietary in-place counterparts implemented.
The basis function \phi always acts scalar-wise on the weights, and I would like to implement broadcasting in order to use .+, .-, .^, .*
but I am not quite sure how to do it the most efficient way.
I assembled an example:
#import LinearAlgebra: mul!
#import AltInplaceOpsInterface: add!
"""
    fill!(a::Matrix{Float64}, b::Matrix{Float64})

Copy `b` element-wise into the preallocated matrix `a` without allocating.
Semantically this is `copyto!`; the `fill!` name is kept so existing call
sites keep working.

NOTE(review): adding a method to `Base.fill!` for types we do not own is
type piracy — consider moving this to a package-local function name.
"""
function Base.fill!(a::Matrix{Float64}, b::Matrix{Float64})
    # `eachindex(a, b)` checks once that both matrices share the same
    # indices, which makes the `@inbounds` annotation provably safe
    # (the old `1:length(a)` form silently invoked undefined behavior
    # when `b` was smaller than `a`).
    @inbounds @simd for i in eachindex(a, b)
        a[i] = b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Matrix{Float64})

In-place element-wise accumulation `a .+= b` without allocating.
"""
function add!(a::Matrix{Float64}, b::Matrix{Float64})
    # `eachindex(a, b)` validates matching indices once, so `@inbounds`
    # is safe; the old `1:length(a)` form was UB for mismatched sizes.
    @inbounds @simd for i in eachindex(a, b)
        a[i] += b[i]
    end
    return nothing
end
"""
    add!(a::Matrix{Float64}, b::Float64)

Add the scalar `b` to every entry of `a` in place.
"""
function add!(a::Matrix{Float64}, b::Float64)
    # `eachindex` (instead of `1:length(a)`) makes `@inbounds` provably safe.
    @inbounds @simd for i in eachindex(a)
        a[i] += b
    end
    return nothing
end
"""
    mul!(a::Matrix{Float64}, b::Float64)

Scale every entry of `a` by the scalar `b` in place.
"""
function mul!(a::Matrix{Float64}, b::Float64)
    @inbounds @simd for i in eachindex(a)
        # BUG FIX: the original body did `a[i] += b`, i.e. it ADDED the
        # scalar instead of multiplying by it, so the in-place benchmark
        # computed a different result than the broadcast versions.
        a[i] *= b
    end
    return nothing
end
Base.fill!(a::Vector{Matrix{Float64}}, b::Float64) = foreach(x->fill!(x,b), a)
"""
    fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})

Copy each matrix `b[i]` into the preallocated matrix `a[i]` (delegates to
the matrix/matrix `fill!` method; no allocation).
"""
function Base.fill!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    # `eachindex(a, b)` verifies both vectors share the same indices,
    # replacing the unsafe `@inbounds for i = 1:length(a)` pattern.
    @inbounds for i in eachindex(a, b)
        fill!(a[i], b[i])
    end
    return nothing
end
"""
    add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})

Accumulate `b[i]` into `a[i]` for every element, via the matrix/matrix
`add!` method; no allocation.
"""
function add!(a::Vector{Matrix{Float64}}, b::Vector{Matrix{Float64}})
    # `eachindex(a, b)` checks matching indices once, making `@inbounds` safe.
    @inbounds for i in eachindex(a, b)
        add!(a[i], b[i])
    end
    return nothing
end
"""
    add!(a::Vector{Matrix{Float64}}, b::Float64)

Add the scalar `b` to every entry of every inner matrix of `a` in place.
"""
function add!(a::Vector{Matrix{Float64}}, b::Float64)
    # `eachindex` replaces the `1:length(a)` anti-pattern.
    @inbounds for i in eachindex(a)
        add!(a[i], b)
    end
    return nothing
end
"""
    mul!(a::Vector{Matrix{Float64}}, b::Float64)

Scale every inner matrix of `a` by the scalar `b` in place, via the
matrix/scalar `mul!` method.
"""
function mul!(a::Vector{Matrix{Float64}}, b::Float64)
    # `eachindex` replaces the `1:length(a)` anti-pattern.
    @inbounds for i in eachindex(a)
        mul!(a[i], b)
    end
    return nothing
end
# test weighted sum with broadcasting
# Weighted-sum benchmark using fused broadcasting only:
# accumulates `ntimes` randomly weighted copies of `val` into `ret`.
# One `rand()` weight is drawn per iteration; `ret` is zeroed first.
function test_wsum!(ret::T, val::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        w = rand()
        ret .+= w .* val
    end
    return nothing
end
# test weighted sum with broadcasting and a temp array
# Weighted-sum benchmark using broadcasting through an explicit scratch
# buffer `tmp`: each iteration writes `val * w` into `tmp`, then
# accumulates `tmp` into `ret`. `ret` is zeroed first.
function test_wsum!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        w = rand()
        @. tmp = val * w
        ret .+= tmp
    end
    return nothing
end
# test weighted sum with broadcasting and an additional loop
"""
    test_wsum_vec!(rets, vals, ntimes)

Weighted-sum benchmark with an explicit outer loop: broadcasting is applied
per element of the outer vector, so each `rets[j] .+= w .* vals[j]` fuses on
a plain container and avoids the per-step temporaries that broadcasting over
the whole `Vector{T}` would allocate.
"""
function test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int) where T
    fill!(rets, 0.0)
    for _ in 1:ntimes
        # FIX: the inner loop reused the outer loop's variable name `i`
        # (shadowing); renamed for clarity. `eachindex(rets, vals)` also
        # checks the two vectors agree, making `@inbounds` safe.
        @inbounds for j in eachindex(rets, vals)
            rets[j] .+= vals[j] .* rand()
        end
    end
    return nothing
end
# test weighted sum with inplace operations (temp array is needed)
# Weighted-sum benchmark built purely on the proprietary in-place
# operators; the scratch buffer `tmp` is required because `mul!`
# scales its first argument in place.
function test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int) where T
    fill!(ret, 0.0)
    for _ in 1:ntimes
        fill!(tmp, val)    # tmp <- val
        mul!(tmp, rand())  # tmp <- tmp scaled by a fresh random weight
        add!(ret, tmp)     # ret <- ret + tmp
    end
    return nothing
end
# --- Benchmark setup --------------------------------------------------------
# n×m matrices; N matrices per outer vector; ntimes summation steps per run.
n = 20
m = 100
N = 30
ntimes = 20_000
# Single-matrix operands: weight source, scratch buffer, result.
val = rand(Float64,n,m)
tmp = zeros(Float64,n,m)
ret = zeros(Float64,n,m)
# Vector-of-matrices operands: N independent n×m matrices each.
vals = map(x->rand(Float64,n,m), 1:N)
tmps = map(x->zeros(Float64,n,m), 1:N)
rets = map(x->zeros(Float64,n,m), 1:N)
# NOTE(review): @time includes compilation on the first call of each method;
# BenchmarkTools.@btime with $-interpolated arguments gives sturdier numbers.
# test with matrices
@time test_wsum!(ret, val, ntimes)
# 0.010501 seconds
@time test_wsum!(ret, val, tmp, ntimes)
# 0.036385 seconds
@time test_wsum_inplace_ops!(ret, val, tmp, ntimes)
# 0.010074 seconds
# test with vector of matrices
@time test_wsum!(rets, vals, ntimes)
# 1.609179 seconds (1.20 M allocations: 18.024 GiB, 5.03% gc time)
# ^ broadcasting over the outer Vector allocates a temporary Matrix per
#   element per step, which explains the GiB-scale allocation figure.
@time test_wsum!(rets, vals, tmps, ntimes)
# 1.617881 seconds (1.20 M allocations: 18.024 GiB, 5.87% gc time)
@time test_wsum_vec!(rets, vals, ntimes)
# 0.382193 seconds
@time test_wsum_inplace_ops!(rets, vals, tmps, ntimes)
# 0.696167 seconds
I basically have two functions:
test_wsum!(ret::T, val::T, ntimes::Int)
where I use broadcasting
and
test_wsum_inplace_ops!(ret::T, val::T, tmp::T, ntimes::Int)
where I use the in-place functions
For Matrix{Float64}
the broadcast version and the in-place operations version are equally fast.
For Vector{Matrix{Float64}}
only the in-place operations version is fast, while the broadcast version allocates memory.
Only if I put a loop around the broadcast version as in
test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)
the broadcast version is fast (it is actually the fastest).
My question would be:
Can I maintain the syntax of test_wsum!(ret::T, val::T, ntimes::Int)
(without the additional loop) and be as fast as, or even faster than, test_wsum_vec!(rets::Vector{T}, vals::Vector{T}, ntimes::Int)
?
That way all data types could remain using the same lines of code and I do not have to specialize or something like that.
Thank you in advance.
Greetz max