I found a very simple task where the Julia performances are unexpectedly and significantly worse than C++.
The core function to be benchmarked is as follows:
function subr!(d1, nx, ny, d2)
for iy in 1:ny
for ix in 1:nx
d2[ix,iy] = d1[ix,iy] + ix + iy
end
end
end
where both d1
and d2
are Matrix{Float32}
. The complete MWE for both Julia and C++ are available at the bottom of this post.
The performance ratio with respect to C++ (-O3
) is of the order of ~5. The original author of the benchmark (see below) found an even worse ratio of ~10.
My julia version is:
julia> versioninfo()
Julia Version 1.1.0
Commit 80516ca202 (2019-01-21 21:24 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, skylake)
but I obtained the same results with v1.3.0-rc2.
I tried all the tricks I am aware of. Among these:
- start julia with
julia -O3
; - inspect the output of
@code_warntype
; - exchange row/column access order;
- using
@simd
,@inbounds
,@fastmath
in many combinations; - using a
Vector{Vector{Float32}}
(as it actually works in C++), rather than aMatrix{Float32}
;
but none solved the problem.
Is there a reason why the Julia performances are so bad in this case?
Is there a way to improve performances by slightly tweaking the code shown below?
More info (as well as a comparison with many other languages) is available in the repo of the original author of the benchmark.
MWE in C++:
// Compile with:
// g++ -o test -O3 test.cpp
#include <stdio.h>
#include <stdlib.h>
void subr (float* d1[], int nx, int ny, float* d2[]) {
for (int iy = 0; iy < ny; iy++)
for (int ix = 0; ix < nx; ix++)
d2[iy][ix] = d1[iy][ix] + ix + iy;
}
int main (int argc, char* argv[]) {
int nrpt = 1000000;
int nx = 2000;
int ny = 10;
printf ("Arrays have %d rows of %d columns, repeats = %d\n",ny,nx,nrpt);
// Allocate and initialize
float* d1Data = (float*) malloc (nx * ny * sizeof(float));
float* d2Data = (float*) malloc (nx * ny * sizeof(float));
float** d1 = (float**) malloc (ny * sizeof(float*));
float** d2 = (float**) malloc (ny * sizeof(float*));
for (int iy = 0; iy < ny; iy++) {
d1[iy] = d1Data + iy * nx;
d2[iy] = d2Data + iy * nx;
}
for (int iy = 0; iy < ny; iy++)
for (int ix = 0; ix < nx; ix++)
d1[iy][ix] = float(nx - ix + ny - iy);
// Main job
for (int Loop = 0; Loop < nrpt; Loop++) {
subr(d1, nx, ny, d2);
}
// Check the results (all cells should contain the same value: nx+ny).
bool Error = false;
for (int iy = 0; iy < ny; iy++) {
for (int ix = 0; ix < nx; ix++) {
if (d2[iy][ix] != nx+ny) {
Error = true;
printf ("Error Out[%d][%d] = %f, not %f\n",iy,ix,
d2[iy][ix],float(d1[iy][ix] + ix + iy));
break;
}
}
if (Error) break;
}
return 0;
}
MWE in Julia:
function subr!(d1, nx, ny, d2)
for iy in 1:ny
for ix in 1:nx
d2[ix,iy] = d1[ix,iy] + ix + iy
end
end
end
function main()
nrpt = 1000000
nx = 2000
ny = 10
println("Arrays have ",ny," rows of ",nx," columns, repeats = ",nrpt)
# Allocate and initialize
d1 = zeros(Float32,nx,ny)
d2 = zeros(Float32,nx,ny)
for iy in 1:ny, ix in 1:nx
d1[ix,iy] = nx - ix + ny - iy
end
# Warm-up
subr!(d1,nx,ny,d2)
# Main job
@time for irpt in 1:nrpt
subr!(d1, nx, ny, d2)
end
# Check the results (all cells should contain the same value: nx+ny).
for iy in 1:ny, ix in 1:nx
if d2[ix,iy] != nx + ny
println("error ", d2[ix,iy]," ",ix," ",iy)
break
end
end
end
main()