Thank you for an enlightening discussion! I see that the issue is quite delicate. With your help I was able to make the Julia code competitive. Below are my timings and code for Matlab, C++ and Julia.

Language |
Time (s) |

C++ (Eigen) |
12 |

Matlab (normal use) |
14 |

Julia |
18 |

Matlab (single thread) |
20 |

###
C++ (Eigen)

```
#include <iostream>
#include <chrono>
#include <Eigen/Dense>
using namespace Eigen;
template <typename Derived1, typename Derived2>
MatrixXd MFdiff(
DenseBase<Derived1> const & x1,
DenseBase<Derived2> const & x2)
{
int d = x1.rows();
int n1 = x1.cols();
int n2 = x2.cols();
MatrixXd out(d, n1*n2);
for (int c = 0, idx = 0; c != n2; c+=1, idx += n1)
out.block(0, idx, d, n1) = x1.colwise() - x2.col(c);
return out;
}
void test(VectorXi const & ds,
VectorXi const & n1s,
VectorXi const & n2s)
{
auto start = std::chrono::high_resolution_clock::now();
MatrixXd z1 = MatrixXd::Random(ds.maxCoeff(), n1s.maxCoeff());
MatrixXd z2 = MatrixXd::Random(ds.maxCoeff(), n2s.maxCoeff());
std::chrono::duration<double> timer1 =
std::chrono::high_resolution_clock::now() - start;
start = std::chrono::high_resolution_clock::now();
double s = 0;
int k = 0;
for (int idx = 0; idx!=ds.rows(); idx++) {
for (int jdx = 0; jdx!=n1s.rows(); jdx++) {
for (int kdx = 0; kdx!=n2s.rows(); kdx++) {
MatrixXd K = MFdiff(z1.topLeftCorner(ds(idx), n1s(jdx)),
z2.topLeftCorner(ds(idx), n2s(kdx)));
s += K(0,0);
k += 1;
}
}
}
std::chrono::duration<double> timer2 =
std::chrono::high_resolution_clock::now() - start;
std::cout << "Generate " << ds.maxCoeff() * n1s.maxCoeff() + ds.maxCoeff() * n2s.maxCoeff()
<< " random numers: " << timer1.count() << " s.\n";
std::cout << "Calculate " << k << " blocks: "
<< timer2.count() << " s.\n";
std::cout << "Sum: " << s << '\n';
}
int main() {
VectorXi ds = VectorXi::LinSpaced(5, 1, 5);
VectorXi n1s = VectorXi::LinSpaced(100, 5, 500);
VectorXi n2s = VectorXi::LinSpaced(100, 5, 500);
test(ds, n1s, n2s);
}
```

###
Matlab

```
% Reference implementation of the MFdiff
% matlab -singleCompThread -nojvm -nodisplay -r test6
ds = 1:5;
n1s = 5:5:500;
n2s = 5:5:500;
z1 = randn(max(ds), max(n1s));
z2 = randn(max(ds), max(n2s));
%%
tic;
s = 0;
t = 0;
for idx = 1:numel(ds)
for jdx = 1:numel(n1s)
for kdx = 1:numel(n2s)
K = MFdiff(z1(1:ds(idx), 1:n1s(jdx)),...
z2(1:ds(idx), 1:n2s(kdx)));
s = s + K(1,1);
t = t + 1;
end
end
end
toc
function M = MFdiff(x1, x2)
% Computes a flattened multidimensional x1' - x2
% x1 is (d x n1)
% x2 is (d x n2)
% M is (d x n1 * n2)
n1 = size(x1, 2);
n2 = size(x2, 2);
M = repmat(x1, 1, n2) - repelem(x2, 1, n1);
end
```

###
Julia

```
function toTest(ds, n1s, n2s, z1, z2)
s = 0
t = 0
for idx = 1:length(ds)
for jdx = 1:length(n1s)
for kdx = 1:length(n2s)
K = BMdiff(z1[1:ds[idx], 1:n1s[jdx]], z2[1:ds[idx], 1:n2s[kdx]])
s = s + K[1,1]
t = t + 1
end
end
end
return s, t
end
function BMdiff(x1, x2)
# Broadcasted matrix diff
d, n1 = size(x1)
d, n2 = size(x2)
out = similar(x1, d, n1 * n2)
BMdiff!(out, x1, x2)
return out
end
function BMdiff!(out, x1, x2)
# Broadcasted matrix diff
n1 = size(x1, 2)
n2 = size(x2, 2)
for k = 1:n2
out[:, 1+n1*(k-1):n1*k] .= x1 .- getindex.((x2,), axes(x2,1), k)
end
end
const ds = 1:5
const n1s = 5:5:500
const n2s = 5:5:500
const z1s = randn(maximum(ds), maximum(n1s))
const z2s = randn(maximum(ds), maximum(n2s))
println(toTest(ds, n1s, n2s, z1s, z2s))
```