Just for fun, I implemented my idea using SIMD.jl. It only works for real-valued matrices, complex values are a different kettle of fish.
For 1000x1000 matrices, it is approximately 2x as fast as a regular elementwise multiplication over the (dense) parent array, pretty close to what I predicted. (I also tried LoopVectorization; see the edit below.) It is not fully optimal, nor is it really ‘safe’ with respect to all sizes and indexing schemes (it will probably fail for very small matrices, for example):
"""
    simdmul!(C::LowerTriangular, A::LowerTriangular, B::LowerTriangular)

Compute the elementwise (Hadamard) product of the lower-triangular matrices
`A` and `B` into `C`, vectorizing over the dense parent arrays with SIMD.jl.
Real element types only. Returns `C`.
"""
function simdmul!(C::LowerTriangular{T}, A::LowerTriangular{T}, B::LowerTriangular{T}) where {T<:Real}
    size(A) == size(B) == size(C) || error("Incompatible sizes.")
    # Lane width: how many elements of T fit in a 512-bit SIMD register.
    # NOTE(review): this hard-codes 512-bit vectors (AVX-512); on narrower
    # hardware SIMD.jl will split the lane, which still works but may not be
    # the optimal width — confirm against the target CPU.
    N = div(512, 8 * sizeof(T))
    lowermul!(parent(C), parent(A), parent(B), Val(N))
    # Fix: return the wrapped matrix C. The original fell through and
    # returned lowermul!'s result — the dense parent array — which is
    # inconsistent with automul! and with Base's mul! convention.
    return C
end
# Kernel: elementwise product over the column-major storage of C, A, B using
# SIMD.jl lanes of width N, touching (roughly) only the lower triangle.
# NOTE(review): requires SIMD.jl's `VecRange` to be in scope. The author
# warns this is not safe for all sizes/indexing schemes (e.g. very small
# matrices) — a lane starting near the end of a column appears to overrun
# into the next column's above-diagonal entries, which a LowerTriangular
# wrapper would ignore anyway; verify before reusing on raw dense data.
function lowermul!(C, A, B, ::Val{N}) where {N}
    # A SIMD index covering N consecutive linear indices, anchored at the
    # first linear index of A.
    lane = VecRange{N}(firstindex(A, 1))
    (J, K) = size(C)
    # J_ is the linear-index upper bound for the current column's chunk loop;
    # skip is the linear offset just before the current column's diagonal.
    J_ = J
    skip = 0
    # All columns except the last; the final (bottom-right) element is done
    # as a scalar below so the vector loop never reads past the array end.
    for _ in 1:K-1
        # Multiply N-wide chunks starting at this column's diagonal element
        # (linear index skip + 1) and running down the column.
        for j in skip:N:J_
            lanej = lane + j
            C[lanej] = A[lanej] * B[lanej]
        end
        # Jump straight to the next column's diagonal: J entries per column
        # plus one more to step one row down — this skips the above-diagonal
        # entries without iterating over them.
        skip += J + 1 # <- this is the trick that saves operations
        J_ += J
    end
    # Handle the last diagonal element as a scalar.
    C[end] = A[end] * B[end]
    return C
end
Edit: Actually, you don’t need manual SIMD — you can just ask LoopVectorization to vectorize the inner loop for you and do the completely simple, naive thing, leading to cleaner code:
"""
    automul!(C::LowerTriangular, A::LowerTriangular, B::LowerTriangular)

Elementwise product of the lower-triangular matrices `A` and `B`, written
into `C`. Delegates the actual loop to `lowermul!` over the dense parent
arrays. Real element types only. Returns `C`.
"""
function automul!(C::LowerTriangular{T}, A::LowerTriangular{T}, B::LowerTriangular{T}) where {T<:Real}
    # Guard clause: all three operands must have matching dimensions.
    if !(size(A) == size(B) == size(C))
        error("Incompatible sizes.")
    end
    lowermul!(parent(C), parent(A), parent(B))
    return C
end
# Kernel: in-place elementwise product restricted to the lower triangle.
# For each column, only rows from the diagonal down are written; entries
# above the diagonal are left untouched. Returns C.
# NOTE(review): the surrounding text mentions LoopVectorization, but this
# loop is plain Julia — the compiler's own vectorizer handles it.
function lowermul!(C, A, B)
    @views for col in axes(C, 2)
        rows = col:lastindex(C, 1)
        C[rows, col] .= A[rows, col] .* B[rows, col]
    end
    return C
end