This is a continuation from this thread: Performance drawback with subtyping - #31 by paulmelis
Here we have two codes, which compute someting simple (the sum of the values of a field of a type in an array of mixed types of objects). Typical case where it is hard to be type-stable and avoid run-time dispatch.
With 2 different types in the list of types, union splitting kicks in in Julia and the code is fast. With more (in the present example 5 types), it does not. In the C++ code clearly it does kick in and the code is fast. Thus, the C++ compiler is doing a better job here.
There are many alternatives to these codes (like restructuring the data to avoid mixed containers, and that is by far the best option in terms of performance, or using a macro to automatically do the union splitting for any number of subtypes). But this is not the point.
The point is that clearly the most simple way to write the code is faster in C++ than in Julia, because the compiler is more aggressive in C++ (or at least this is what I understand is happening).
So, the question is: is it possible to tune some compiler option in Julia so that the compiler more aggressively optimizes this code? (specifically, the number of subtypes above which it quits trying to do union splitting?).
This is a recurrent subject here, and here we have a clear example of C++ being faster than Julia for the code written in the “most natural” way. (not saying that the C++ is more natural in general, it is painfully ugly ). But the structure of the code is the same.
These are the benchmarks:
C++:
g++ -o disp -O3 -march=native -W -Wall disp.cpp
./disp
n = 1000000
dynamic dispatch : 0.008514 us per iteration (8.514000 ms total) [500006.610053]
Julia (1.6.1):
julia disp.jl
result = 499496.79878554505
with runtime dispatch: 59.806 ms (2000000 allocations: 30.52 MiB)
with manual splitting: 8.133 ms (0 allocations: 0 bytes)
And the codes:
C++
// g++ -o disp -O3 -march=native -W -Wall disp.cpp
#include <sys/time.h>
#include <cstdio>
#include <stdlib.h>
#include <vector>
struct LineAbstract
{
virtual double paint() =0;
};
struct LineA : public LineAbstract
{
LineA(double l) { length = l; }
virtual double paint() { return length; }
double length;
};
struct LineB : public LineAbstract
{
LineB(double l) { length = l; }
virtual double paint() { return length; }
double length;
};
struct LineC : public LineAbstract
{
LineC(double l) { length = l; }
virtual double paint() { return length; }
double length;
};
struct LineD : public LineAbstract
{
LineD(double l) { length = l; }
virtual double paint() { return length; }
double length;
};
struct LineE : public LineAbstract
{
LineE(double l) { length = l; }
virtual double paint() { return length; }
double length;
};
struct Picture
{
Picture(std::vector<LineAbstract*> l) { lines = l; }
std::vector<LineAbstract*> lines;
};
// Dynamic dispatch at runtime
double paint(Picture& p)
{
double s = 0.0;
for (auto l : p.lines)
s += l->paint();
return s;
}
double tdiff(struct timeval t0, struct timeval t1)
{
return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec)/1000000.0;
}
int main()
{
int n = 1000000;
printf("n = %i \n",n);
std::vector<LineAbstract*> lines;
for (int i = 0; i < n; i++)
{
double r = ((double) rand() / (RAND_MAX));
if (r <= 0.2)
lines.push_back((LineAbstract*)new LineA(r));
else if (r <= 0.4)
lines.push_back((LineAbstract*)new LineB(r));
else if (r <= 0.6)
lines.push_back((LineAbstract*)new LineC(r));
else if (r <= 0.8)
lines.push_back((LineAbstract*)new LineD(r));
else if (r <= 1.0)
lines.push_back((LineAbstract*)new LineE(r));
}
Picture p(lines);
struct timeval t0, t1;
double t, res;
// Store resulting sum, to make sure the calls aren't erased
gettimeofday(&t0, NULL);
res = paint(p);
gettimeofday(&t1, NULL);
t = tdiff(t0,t1);
printf("dynamic dispatch : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);
}
Julia
module Lines
using BenchmarkTools
using Test
abstract type LineAbstract end
struct Line1 <: LineAbstract length::Float64 end
struct Line2 <: LineAbstract length::Float64 end
struct Line3 <: LineAbstract length::Float64 end
struct Line4 <: LineAbstract length::Float64 end
struct Line5 <: LineAbstract length::Float64 end
struct Picture{T<:LineAbstract}
lines::Vector{T}
end
paint(l :: Line1) = l.length
paint(l :: Line2) = l.length
paint(l :: Line3) = l.length
paint(l :: Line4) = l.length
paint(l :: Line5) = l.length
# Dynamical dispatch at runtime
function paint1(p)
s = 0.
for l in p.lines
s += paint(l)
end
s
end
# Union splitting
function paint2(p)
s = 0.
for l in p.lines
if l isa Line1
s += paint(l)
elseif l isa Line2
s += paint(l)
elseif l isa Line3
s += paint(l)
elseif l isa Line4
s += paint(l)
elseif l isa Line5
s += paint(l)
end
end
s
end
function run(n)
line_types = [ Line1, Line2, Line3, Line4, Line5 ]
p = Picture([line_types[rand(1:5)](rand()) for i in 1:n])
@test paint1(p) ≈ paint2(p)
println("result = ", paint1(p))
print(" with runtime dispatch: "); @btime paint1($p)
print(" with splitting: "); @btime paint2($p)
end
end
# running
Lines.run(1_000_000)
edit: updated the julia code to make it easier to test things with Revise
.