For my FEM work, I want to be able to multiply higher dimension arrays with each other, as in “cartensian tensors”. I know there are packages, I want to learn.
Function dots2 is specialised to multiply 3-Arrays with vectors. It is fast.
Function dots1 is more general, it sums over the last index of first argument and first index of last argument. It’s slow because it’s time-unstable, even though I shielded for instability in the function tenprod! A factor 4 or 5 loss in performance.
I want it general, and I want it fast. Why is my dots1 code below type unstable? Given Ta and Tb, I’d say all types are clearly defined. What did I miss?
: )
Philippe
function dots1(a::Array{Ta},b::Array{Tb}) where {Ta,Tb}
Tc = promote_type(Ta,Tb)
n = 1
sc = (size(a)[1:end-1]...,size(b)[1+1:end]...)
c = zeros(Tc,sc)
r1 = CartesianRange(size(a)[1:end-n])
r2 = CartesianRange(size(b)[1:n])
r3 = CartesianRange(size(b)[n+1:end])
tenprod!(c,a,b,r1,r2,r3)
return c
end
function dots2(a::Array{T,3},b::Vector{T}) where {T}
c = zeros(T,size(a,1),size(a,2))
for k=indices(a,3),j=indices(a,2),i=indices(a,1)
@inbounds @fastmath c[i,j] += a[i,j,k]*b[k]
end
return c
end
@noinline function tenprod!(c,a,b,r1,r2,r3)
for i3 in r3,i2 in r2,i1 in r1
@inbounds @fastmath c[i1,i3] += a[i1,i2]*b[i2,i3]
end
end
function runit(b,c)
for i = 1:100000
a=dots1(b,c)
end
end
b = randn(2,3,4)
c = randn(4)
B = randn(20,30,40)
C = randn(40)
# warm up
a=dots1(b,c)
a=dots2(b,c)
A=dots1(B,C)
A=dots2(B,C)
runit(b,c)
# go!
@time a=dots1(b,c)
@time a=dots2(b,c)
@time A=dots1(B,C)
@time A=dots2(B,C)
if false
Base.Profile.init(delay=0.000001)
Base.Profile.clear()
Base.Profile.clear_malloc_data()
tic()
@profile @time runit(b,c)
toc()
Base.Profile.print()
using ProfileView
ProfileView.view()
end
P.S.
I also tried
function dots1(a::Array{Ta,Na},b::Array{Tb,Nb}) where {Ta,Tb,Na,Nb}
Tc = promote_type(Ta,Tb)
n = 1
sc = (size(a)[1:end-1]...,size(b)[1+1:end]...)
#c = zeros(Tc,sc)
c = Array{Tc,Na+Nb-2}(sc...)
fill!(c,zero(Tc))
r1 = CartesianRange(size(a)[1:end-n])
r2 = CartesianRange(size(b)[1:n])
r3 = CartesianRange(size(b)[n+1:end])
tenprod!(c,a,b,r1,r2,r3)
return c
end
which is as bad.