Hi all!

This is for the performance tweaking addicts out there

I had a conversation with a “C++ friend” today in which upon request I tried to showcase to him that Julia can be as performant as C++ (say, within 30% performance difference). I spare you the entire back-and-forth but, eventually, we ended up with the following Julia code:

```
using Random
using BenchmarkTools
const rng = MersenneTwister()
function get(spin_conf, x, y)
(Nx, Ny) = size(spin_conf)
return @inbounds spin_conf[(x-1+Nx)%Nx + 1, (y-1+Ny)%Ny + 1]
end
function deltaE(spin_conf, i, j)
return 2.0*get(spin_conf,i,j) * (get(spin_conf,i-1,j) + get(spin_conf,i+1,j) +
get(spin_conf,i,j+1) + get(spin_conf,i,j-1))
end
function energy(spin_conf)
(Nx, Ny) = size(spin_conf)
res = 0
for i = 1:Nx
@simd for j = 1:Ny
res += -get(spin_conf,i,j)*(get(spin_conf,i+1,j) + get(spin_conf,i,j+1))
end
end
return res
end
magnetism(spin_conf) = sum(spin_conf)
function sweep!(rec, spin_conf, beta, M)
(Nx, Ny) = size(spin_conf)
@inbounds @simd for i = 1:M
x = rand(rng, 1:Nx)
y = rand(rng, 1:Ny)
r = rand(rng)
dE = deltaE(spin_conf, x, y)
if exp(-beta*dE) > r
spin_conf[x,y] *= -1
end
rec[i] = energy(spin_conf)
end
return rec
end
function main()
beta = 0.2
L = 64
total_iteration = 100_000
spin_conf = rand(rng, (-1,1), L, L)
rec = Vector{Int64}(undef,total_iteration)
sweep!(rec, spin_conf, beta, total_iteration)
s = sum(last(rec, 1000))/1000
s /= L^2
# println(s)
return nothing
end
```

Timings on my machine:

```
julia> @btime main();
1.836 s (5 allocations: 821.34 KiB)
```

Unfortunately, on my machine, this is still 5.56x slower than the C++ code which takes `330 ms`

(see all the C++ details below.)

Anticipating that the C++ version unrolls some loops I tried to optimise the `energy`

function by using

**Explicit loop-unrolling**. (Note that, unfortunately, LoopVectorization.jl’s `@avx`

doesn’t seem to work here.)

```
# Copied from https://github.com/StephenVavasis/Unroll.jl
include("Unroll.jl")
using .Unroll
function energy(spin_conf)
(Nx, Ny) = size(spin_conf)
res = 0
for i = 1:Nx
@unroll for j = 1:64 # THIS LINE HAS CHANGED!
res += -get(spin_conf,i,j)*(get(spin_conf,i+1,j) + get(spin_conf,i,j+1))
end
end
return res
end
```

Timings on my machine:

```
julia> @btime main();
429.526 ms (5 allocations: 821.34 KiB)
```

This gets the Julia code in the same ballpark at the cost of using a macro from an old unmaintained package and hard-coding the iterator length `64`

. However, adding an explicit `#pragma GCC unroll 64`

statement to the C++ code as well (to be fair) the latter only takes `150 ms`

, so the Julia code is, again, ~2.9x slower.

So, the mission, if you choose to accept it, is to tweak this Julia code to the C++ speed (30% performance difference is the goal). While restructuring the code is perfectly fine, algorithmic improvements are not allowed. The imaginary Julia and C++ programmers should be similarly smart

Of course, I would also just appreciate hints/tipps/tricks/comments of any kind! Thanks in advance, guys!

Best,

Carsten

**Unroll.jl**

```
module Unroll
copy_and_substitute_tree(e, varname, newtext) = e
copy_and_substitute_tree(e::Symbol, varname, newtext) =
e == varname ? newtext : e
function copy_and_substitute_tree(e::Expr, varname, newtext)
e2 = Expr(e.head)
for subexp in e.args
push!(e2.args, copy_and_substitute_tree(subexp, varname, newtext))
end
if e.head == :if
newe = e2
try
u = Core.eval(@__MODULE__, e2.args[1])
if u == true
newe = e2.args[2]
elseif u == false
if length(e2.args) == 3
newe = e2.args[3]
else
newe = :nothing
end
end
catch
end
e2 = newe
end
e2
end
macro unroll(expr)
if expr.head != :for || length(expr.args) != 2 ||
expr.args[1].head != :(=) ||
typeof(expr.args[1].args[1]) != Symbol ||
expr.args[2].head != :block
error("Expression following unroll macro must be a for-loop as described in the documentation")
end
varname = expr.args[1].args[1]
ret = Expr(:block)
for k in Core.eval(@__MODULE__, expr.args[1].args[2])
e2 = copy_and_substitute_tree(expr.args[2], varname, k)
push!(ret.args, e2)
end
esc(ret)
end
macro tuplegen(expr)
if expr.head != :comprehension || length(expr.args) != 2 ||
expr.args[2].head != :(=) ||
typeof(expr.args[2].args[1]) != Symbol
error("Expression following tuplegen macro must be a comprehension as described in the documentation")
end
varname = expr.args[2].args[1]
ret = Expr(:tuple)
for k in Core.eval(@__MODULE__, expr.args[2].args[2])
e2 = copy_and_substitute_tree(expr.args[1], varname, k)
push!(ret.args,e2)
end
esc(ret)
end
export @unroll
export @tuplegen
end
```

**C++ code, compilation & timings**

Ising2D.cpp:

```
#include <Eigen/Dense>
#include <random>
#include <fstream>
#include <iostream>
class Ising2D
{
private:
int64_t L_;
Eigen::Array<int64_t,Eigen::Dynamic,Eigen::Dynamic> confs_;
public:
template<class RandomEngine>
Ising2D(int64_t L, RandomEngine& re)
: L_{L}, confs_(L, L)
{
std::uniform_int_distribution<int64_t> uid(0, 1);
for(int64_t i = 0; i < L; i++)
{
for(int64_t j = 0; j < L; j++)
{
confs_(i,j) = 2*uid(re)-1;
}
}
}
__attribute__((always_inline)) inline int64_t get(int64_t i, int64_t j) const
{
return confs_((i+L_)%L_, (j+L_)%L_);
}
int64_t H() const
{
int64_t res = 0;
for(int64_t i = 0; i < L_; i++)
{
for(int64_t j = 0; j < L_; j++)
{
res += -get(i,j)*(get(i+1,j) + get(i,j+1));
}
}
return res;
}
//energy diff when flipping (i,j)
inline double dH(int64_t i, int64_t j) const
{
return 2.0*get(i, j)*(get(i+1,j) + get(i-1,j) + get(i,j+1) + get(i, j-1));
}
template<class RandomEngine>
Eigen::ArrayXi sweep(RandomEngine& re, const double beta, const int64_t steps)
{
Eigen::ArrayXi res(steps);
std::uniform_int_distribution<int64_t> uid(0, L_-1);
std::uniform_real_distribution<double> urd(0., 1.);
for(int64_t k = 0; k < steps; k++)
{
int64_t x = uid(re);
int64_t y = uid(re);
double p = std::min(std::exp(-beta*dH(x, y)),1.);
if(p > urd(re))
{
confs_(x,y) *= -1;
}
res(k) = H();
}
return res;
}
double magnetization() const
{
return double(confs_.sum())/(L_*L_);
}
void swapConf(Ising2D& other)
{
confs_.swap(other.confs_);
}
};
int main()
{
const double beta = 0.2;
const int64_t L = 64;
const int64_t total_iteration = 100'000;
std::uniform_real_distribution<double> urd(0., 1.);
std::random_device rd;
std::default_random_engine re{rd()};
Ising2D ising(L, re);
auto rec = ising.sweep(re, beta, total_iteration);
printf("%f\n", (double)rec.tail(1000).sum()/1000.0/(L*L));
return 0;
}
```

Dependency: Eigen (simply `git clone https://gitlab.com/libeigen/eigen.git`

and link with `-I`

)

**Naive compilation**

`g++ -O3 -march=native -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D`

Timings on my machine: ` 8.16s user 0.01s system 96% cpu 8.465 total`

`-funroll-loops`

`g++ -O3 -march=native -funroll-loops -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D`

Timings on my machine: `5.61s user 0.01s system 96% cpu 5.801 total`

`-fwhole-program`

`g++ -O3 -march=native -fwhole-program -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D`

Timings on my machine: `0.42s user 0.00s system 61% cpu 0.687 total`

`-fwhole-program`

+ `-funroll-loops`

`g++ -O3 -march=native -funroll-loops -fwhole-program -I ~/.local/lib/eigen/ Ising2D.cpp -o Ising2D`

Timings on my machine: `0.33s user 0.00s system 99% cpu 0.332 total`

`#pragma GCC unroll 64`

Adding explicit loop unrolling by putting `#pragma GCC unroll 64`

in front of the j loop in `H()`

(which is `energy`

in the Julia code):

`g++ -O3 -march=native -funroll-loops -fwhole-program -I ~/.local/lib/eigen/ Ising2D_pragma.cpp`

Timings on my machine: `0.15s user 0.00s system 98% cpu 0.158 total`