I have two 4-dimensional arrays of sizes (r1, m, n, r2) and (s1, n, p, s2), respectively, and these have to be multiplied such that the resulting 4-dimensional array has shape (r1 * s1, m, p, r2 * s2). My naive implementation does the following:

```
"""
    core_mult_col(lhs, rhs)

Multiply two 4-D cores of sizes `(r1, m, n, r2)` and `(s1, n, p, s2)` into a
core of size `(r1*s1, m, p, r2*s2)`, where slice `(i, :, :, j)` of the result
is the matrix product `lhs[i1, :, :, i2] * rhs[j1, :, :, j2]` with
`i = (i1-1)*s1 + j1` and `j = (i2-1)*s2 + j2` (Kronecker-style index merge).

Throws `DimensionMismatch` if the contracted dimensions `n` disagree.
"""
function core_mult_col(lhs, rhs)
    r1, m, n, r2 = size(lhs)
    s1, n2, p, s2 = size(rhs)
    # Guard the contraction dimension instead of silently shadowing `n`.
    n == n2 || throw(DimensionMismatch("inner dimensions differ: $n vs $n2"))
    # Pre-allocate memory for the resulting core.
    result = Array{Float64}(undef, (r1 * s1, m, p, r2 * s2))
    # @views is essential: plain slices like result[i, :, :, j] COPY in Julia,
    # so without it mul! would fill a throwaway temporary and result would
    # stay uninitialized. Views also avoid allocating copies of lhs/rhs slices.
    @views for j2 = 1:s2, i2 = 1:r2, j1 = 1:s1, i1 = 1:r1
        i = (i1 - 1) * s1 + j1
        j = (i2 - 1) * s2 + j2
        mul!(result[i, :, :, j], lhs[i1, :, :, i2], rhs[j1, :, :, j2])
    end
    return result  # the original forgot to return, yielding `nothing`
end
```

However, I know that in this way I am making a lot of “jumps” in memory and therefore, I’m not being very efficient.

My other idea was to permute the dimensions of all the arrays such that the iteration occurs in contiguous blocks of memory:

```
"""
    core_mult_col(lhs, rhs)

Cache-friendly variant of the core multiplication: permute the inputs so the
matrix dimensions come *first* (Julia arrays are column-major, so the leading
dimensions are contiguous in memory), run `mul!` on contiguous slices, then
permute the result back to the required `(r1*s1, m, p, r2*s2)` layout.

Throws `DimensionMismatch` if the contracted dimensions `n` disagree.
"""
function core_mult_col(lhs, rhs)
    r1, m, n, r2 = size(lhs)
    s1, n2, p, s2 = size(rhs)
    n == n2 || throw(DimensionMismatch("inner dimensions differ: $n vs $n2"))
    # Put the matrix dims first: slices plhs[:, :, i1, i2] / prhs[:, :, j1, j2]
    # are then contiguous blocks, unlike the (rank, rank, m, n) layout in the
    # original sketch where [i, j, :, :] slices are still heavily strided.
    plhs = permutedims(lhs, (2, 3, 1, 4))   # (m, n, r1, r2)
    prhs = permutedims(rhs, (2, 3, 1, 4))   # (n, p, s1, s2)
    pre = Array{Float64}(undef, (m, p, r1 * s1, r2 * s2))
    # Loop order chosen so the merged indices i, j advance sequentially and
    # pre[:, :, i, j] is written in memory order (j outermost, i next).
    @views for i2 in 1:r2, j2 in 1:s2, i1 in 1:r1, j1 in 1:s1
        i = (i1 - 1) * s1 + j1
        j = (i2 - 1) * s2 + j2
        mul!(pre[:, :, i, j], plhs[:, :, i1, i2], prhs[:, :, j1, j2])
    end
    # permutedims! needs (dest, src, perm); the original 2-arg call was
    # invalid. The out-of-place form is simplest here.
    return permutedims(pre, (3, 1, 2, 4))   # (r1*s1, m, p, r2*s2)
end
```

However, I’m not sure how to do this iteration efficiently. Is there a way to do this in Julia specifically?

PS: I already used the TensorOperations.jl package, but the performance was slower in comparison to the implementation that I had in Python.

Thanks in advance.