Performance drawback with subtyping

paulmelis · May 24, 2021, 12:55pm

As a follow-up to What Python Creator Guido van Rossum Thinks of Rust, Go, Julia, and TypeScript - #24 by lmiq here’s a straightforward C++ port of @lmiq’s Julia code.

code

// g++ -o paint -O3 -march=native -W -Wall  paint.cpp
#include <sys/time.h>
#include <cstdio>
#include <stdlib.h>
#include <vector>

struct LineAbstract
{
    virtual double paint() =0;
};

struct LineA : public LineAbstract
{     
    LineA(double w) { width = w; }
    virtual double paint() { return width; }
    
    double width;
};

struct LineB : public LineAbstract
{        
    LineB(double l) { length = l; }
    virtual double paint() { return length; }
    
    double length;
};

struct Picture
{
    Picture(std::vector<LineAbstract*> l) { lines = l; }
    
    std::vector<LineAbstract*> lines;
};

// Dynamic dispatch at runtime 

double 
paint(Picture& p)
{
    double s = 0.0;
    
    for (auto l : p.lines)
        s += l->paint();
    
    return s;
}

// Union splitting 

double
paint2(Picture& p)
{
    double s = 0.0;
    LineA *a;
    LineB *b;
    
    for (auto l : p.lines)
    {
        a = dynamic_cast<LineA*>(l);        
        if (a != nullptr)
            s += a->paint();
        else
        {
            b = dynamic_cast<LineB*>(l);        
            if (b != nullptr)
                s += b->paint();
        }
    }
    
    return s;
}

// Functors

double line(LineA *a) { return a->width; }
double line(LineB *b) { return b->length; }

double 
paint3(Picture& p)
{
    double s = 0.0;
    
    for (auto l : p.lines)
    {
        if (dynamic_cast<LineA*>(l))
            s += line(static_cast<LineA*>(l));
        else if (dynamic_cast<LineB*>(l))
            s += line(static_cast<LineB*>(l));
    }
    
    return s;
}

// Naive sums

double 
naivesum(std::vector<double> x)
{
    double s = 0.0;
    for (auto v : x)
        s += v;
    return s;
}

double 
naivesum2(std::vector<double> x)
{
    double s = 0.0;
    for (unsigned int i = 0; i < x.size(); i++)
        s += x[i];
    return s;
}

double tdiff(struct timeval t0, struct timeval t1)
{
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec)/1000000.0;
}

int main()
{
    const int n = 1000000;
    
    std::vector<double> x;
    for (int i = 0; i < n; i++)
        x.push_back(random());
    
    std::vector<LineAbstract*> lines;
    for (int i = 0; i < n; i++)
    {        
        // Need the casts to make ? : expression get accepted
        lines.push_back((1.0*random()/RAND_MAX) < 0.5 ? (LineAbstract*)new LineA(x[i]) : (LineAbstract*)new LineB(x[i]));
    }
    
    Picture p(lines);

    struct timeval t0, t1;
    double t, res;
    
    // Store resulting sum, to make sure the calls aren't erased
    gettimeofday(&t0, NULL);
    res = paint(p);
    gettimeofday(&t1, NULL);
    t = tdiff(t0, t1);
    printf("dynamic dispatch : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);
    
    gettimeofday(&t0, NULL);
    res = paint2(p);
    gettimeofday(&t1, NULL);
    t = tdiff(t0, t1);
    printf("union splitting  : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);

    gettimeofday(&t0, NULL);
    res = paint3(p);
    gettimeofday(&t1, NULL);
    t = tdiff(t0, t1);
    printf("functors         : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);

    gettimeofday(&t0, NULL);
    res = naivesum(x);
    gettimeofday(&t1, NULL);
    t = tdiff(t0, t1);
    printf("naive sum        : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);

    gettimeofday(&t0, NULL);
    res = naivesum2(x);
    gettimeofday(&t1, NULL);
    t = tdiff(t0, t1);
    printf("naive sum 2      : %.6f us per iteration (%.6f ms total) [%.6f]\n", t/n*1000000, t*1000, res);
}

Results on my system (GCC 10.2) (updated to do the random filling of the array correctly):

dynamic dispatch : 0.006555 us per iteration (6.555000 ms total) [1073756018481283.000000]
union splitting  : 0.026228 us per iteration (26.228000 ms total) [1073756018481283.000000]
functors         : 0.026444 us per iteration (26.444000 ms total) [1073756018481283.000000]
naive sum        : 0.004331 us per iteration (4.331000 ms total) [1073756018481283.000000]
naive sum 2      : 0.001780 us per iteration (1.780000 ms total) [1073756018481283.000000]

Julia 1.6.1 results:

n = 1000000
 with dynamic dispatch:   6.644 ms (0 allocations: 0 bytes)
 with splitting:   5.892 ms (0 allocations: 0 bytes)
 with functors:   6.627 ms (0 allocations: 0 bytes)
 simple sum of an array of Float64 of same size:   6.623 ms (0 allocations: 0 bytes)

~~So the completely straightforward dynamic dispatch is easily fastest in C++.~~ Not any longer with the random fix (see edit 2 below)

Edit: the initial naive sum used the iterator protocol, which apparently isn’t the fastest. I added a version that uses vector indexing, which is much faster, because it can address the underlying array directly.

Edit 2: see below for a porting error I made, random() in C/C++ returns an integer, so the array filled with a single type. Forcing the random value in [0,1] changes the results substantially.