Thanks for posting this. Coincidentally, I was trying to compare the previous versions with a numba-accelerated Python version (just to see if LLVM made any difference), but now, with this, Julia is about 6x faster.
from numpy.linalg import inv as npinv
from numpy import zeros_like, einsum, random
from timeit import timeit
from numba import jit, prange
@jit(nopython=True, cache=True, nogil=True)
def vdet(F):
    """Determinant of every 3x3 matrix in a vectorized stack.

    F is indexed as F[i, j, a, b]: one 3x3 matrix per grid point
    (a, b) — assumes the first two axes have length 3; TODO confirm
    against the caller. Returns an array of shape F[0, 0].shape
    holding the determinant at each grid point.
    """
    det = zeros_like(F[0, 0])
    rows, cols = det.shape
    for r in range(rows):
        for c in range(cols):
            # 2x2 minors for the cofactor expansion along the first row.
            m00 = F[1, 1, r, c] * F[2, 2, r, c] - F[1, 2, r, c] * F[2, 1, r, c]
            m01 = F[1, 0, r, c] * F[2, 2, r, c] - F[1, 2, r, c] * F[2, 0, r, c]
            m02 = F[1, 0, r, c] * F[2, 1, r, c] - F[1, 1, r, c] * F[2, 0, r, c]
            det[r, c] = (F[0, 0, r, c] * m00
                         - F[0, 1, r, c] * m01
                         + F[0, 2, r, c] * m02)
    return det
@jit(nopython=True, cache=True, nogil=True)
def vinv(F):
    """Invert every 3x3 matrix in a vectorized stack.

    F is indexed as F[i, j, a, b]: one 3x3 matrix per grid point
    (a, b) — assumes the first two axes have length 3; TODO confirm
    against the caller. Returns Finv with the same layout, where
    Finv[:, :, a, b] is the adjugate of F[:, :, a, b] divided by its
    determinant. A singular matrix at some grid point leads to a
    division by zero at that point (behavior then depends on numba's
    scalar-division semantics — NOTE(review): same as the original,
    which also divided by J[a, b] directly).
    """
    J = vdet(F)
    Finv = zeros_like(F)
    for a in range(J.shape[0]):
        for b in range(J.shape[1]):
            # Hoist the loop-invariant division: one reciprocal per
            # grid point instead of nine scalar divides.
            d = 1.0 / J[a, b]
            Finv[0, 0, a, b] = (-F[1, 2, a, b] * F[2, 1, a, b] +
                                F[1, 1, a, b] * F[2, 2, a, b]) * d
            Finv[1, 0, a, b] = (F[1, 2, a, b] * F[2, 0, a, b] -
                                F[1, 0, a, b] * F[2, 2, a, b]) * d
            Finv[2, 0, a, b] = (-F[1, 1, a, b] * F[2, 0, a, b] +
                                F[1, 0, a, b] * F[2, 1, a, b]) * d
            Finv[0, 1, a, b] = (F[0, 2, a, b] * F[2, 1, a, b] -
                                F[0, 1, a, b] * F[2, 2, a, b]) * d
            Finv[1, 1, a, b] = (-F[0, 2, a, b] * F[2, 0, a, b] +
                                F[0, 0, a, b] * F[2, 2, a, b]) * d
            Finv[2, 1, a, b] = (F[0, 1, a, b] * F[2, 0, a, b] -
                                F[0, 0, a, b] * F[2, 1, a, b]) * d
            Finv[0, 2, a, b] = (-F[0, 2, a, b] * F[1, 1, a, b] +
                                F[0, 1, a, b] * F[1, 2, a, b]) * d
            Finv[1, 2, a, b] = (F[0, 2, a, b] * F[1, 0, a, b] -
                                F[0, 0, a, b] * F[1, 2, a, b]) * d
            Finv[2, 2, a, b] = (-F[0, 1, a, b] * F[1, 0, a, b] +
                                F[0, 0, a, b] * F[1, 1, a, b]) * d
    return Finv
%timeit vinv(F)
69.3 µs ± 1.56 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
vs your code
julia> @btime many3x3inverts!($Barray, $Aarray)
10.399 μs (0 allocations: 0 bytes)
It’s remarkably fast. Thanks for such an enlightening discussion.