Are there tricks to reduce compile-time for this code?


#1

I am exploring the use of generated functions for evaluating multivariate polynomials, and all in all I am very pleased with the results I get. The only problem is that for larger polynomials the compile time can become quite significant, since the generated code gets rather long.
Here is one (already truncated) example:

# grad(u, coefficients, x)
#
# Auto-generated, fully unrolled kernel. It accumulates 47 terms (one per entry
# `coefficients[1]` .. `coefficients[47]`) into four scalar accumulators and then
# stores them in `u[1]` .. `u[4]`.
#
# Each term multiplies its coefficient by precomputed powers of `x[1]` .. `x[4]`
# and adds an integer-scaled product of the variables to some of `u1` .. `u4`
# via `muladd`. Judging by the name and the per-term structure, this appears to
# be the gradient of a 4-variable polynomial with respect to `x`; the polynomial
# itself is not shown here.
#
# Arguments:
#   u            - output vector; its element type `T` is used for the
#                  accumulators, and indices 1..4 are overwritten
#   coefficients - indexable collection, read at indices 1..47
#   x            - indexable collection, read at indices 1..4
#
# Returns `u` (the same object that was passed in, mutated in place).
function grad(u::AbstractVector{T}, coefficients, x) where T
    begin 
        # Powers of each variable: `xi_k` holds x[i]^k, built by repeated multiplication.
        x1 = x[1]
        x1_1 = x1
        x1_2 = x1_1 * x1
        x1_3 = x1_2 * x1
        x2 = x[2]
        x2_1 = x2
        x2_2 = x2_1 * x2
        x2_3 = x2_2 * x2
        x2_4 = x2_3 * x2
        x2_5 = x2_4 * x2
        x3 = x[3]
        x3_1 = x3
        x3_2 = x3_1 * x3
        x3_3 = x3_2 * x3
        x4 = x[4]
        x4_1 = x4
        x4_2 = x4_1 * x4
        x4_3 = x4_2 * x4
        x4_4= x4_3 * x4
        x4_5 = x4_4 * x4
        # One accumulator per output component, zero-initialized in the element type of `u`.
        u1 = zero(T)
        u2 = zero(T)
        u3 = zero(T)
        u4 = zero(T)
        # Term 1. Every term below follows the same pattern:
        #   c    = coefficient times precomputed variable powers
        #   _xN  = running product of the term's variables taken left-to-right (x1, x2, ...)
        #   _yN  = running product of the term's variables taken right-to-left (x4, x3, ...)
        #   uK   = muladd(c * <integer factor> [* _x..], _y.. or _x.., uK)
        c = coefficients[1] * x3_1 * x4_5
        _x3 = x3
        _y4 = x4
        # NOTE: `_y3` is not read before it is reassigned in a later term; the
        # generated code contains several such unused running products.
        _y3 = _y4 * x3
        u3 = muladd(c * 2, _y4, u3)
        u4 = muladd(c * 6, _x3, u4)
        c = coefficients[2] * x1_1 * x4_5
        _x1 = x1
        _y4 = x4
        u1 = muladd(c * 2, _y4, u1)
        u4 = muladd(c * 6, _x1, u4)
        c = coefficients[3] * x3_2 * x4_4
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c * 3, _y4, u3)
        u4 = muladd(c * 5, _x3, u4)
        c = coefficients[4] * x4_4
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 5, _x3, u4)
        c = coefficients[5] * x1_1 * x4_4
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c * 2, _y3, u1)
        u3 = muladd(c * _x1, _y4, u3)
        u4 = muladd(c * 5, _x3, u4)
        c = coefficients[6] * x4_4
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c, _y3, u1)
        u3 = muladd(c * _x1, _y4, u3)
        u4 = muladd(c * 5, _x3, u4)
        c = coefficients[7] * x4_4
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c, _y4, u3)
        u4 = muladd(c * 5, _x3, u4)
        c = coefficients[8] * x3_3 * x4_3
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c * 4, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[9] * x2_1 * x3_1 * x4_3
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 2, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[10] * x3_1 * x4_3
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[11] * x3_1 * x4_3
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[12] * x1_1 * x3_1 * x4_3
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c * 2, _y3, u1)
        u3 = muladd(c * 2 * _x1, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[13] * x3_1 * x4_3
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c, _y3, u1)
        u3 = muladd(c * 2 * _x1, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[14] * x3_1 * x4_3
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c * 2, _y4, u3)
        u4 = muladd(c * 4, _x3, u4)
        c = coefficients[15] * x1_1 * x2_1 * x4_3
        _x1 = x1
        _x2 = _x1 * x2
        _y4 = x4
        _y2 = _y4 * x2
        u1 = muladd(c * 2, _y2, u1)
        u2 = muladd(c * 2 * _x1, _y4, u2)
        u4 = muladd(c * 4, _x2, u4)
        c = coefficients[16] * x1_2 * x4_3
        _x1 = x1
        _x2 = _x1 * x2
        _y4 = x4
        _y2 = _y4 * x2
        u1 = muladd(c * 3, _y2, u1)
        u2 = muladd(c * _x1, _y4, u2)
        u4 = muladd(c * 4, _x2, u4)
        c = coefficients[17] * x1_1 * x4_3
        _x1 = x1
        _x2 = _x1 * x2
        _y4 = x4
        _y2 = _y4 * x2
        u1 = muladd(c * 2, _y2, u1)
        u2 = muladd(c * _x1, _y4, u2)
        u4 = muladd(c * 4, _x2, u4)
        c = coefficients[18] * x4_3
        _x1 = x1
        _x2 = _x1 * x2
        _y4 = x4
        _y2 = _y4 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * _x1, _y4, u2)
        u4 = muladd(c * 4, _x2, u4)
        c = coefficients[19] * x1_3 * x4_3
        _x1 = x1
        _y4 = x4
        u1 = muladd(c * 4, _y4, u1)
        u4 = muladd(c * 4, _x1, u4)
        c = coefficients[20] * x1_2 * x4_3
        _x1 = x1
        _y4 = x4
        u1 = muladd(c * 3, _y4, u1)
        u4 = muladd(c * 4, _x1, u4)
        c = coefficients[21] * x1_1 * x4_3
        _x1 = x1
        _y4 = x4
        u1 = muladd(c * 2, _y4, u1)
        u4 = muladd(c * 4, _x1, u4)
        c = coefficients[22] * x4_3
        _x1 = x1
        _y4 = x4
        u1 = muladd(c, _y4, u1)
        u4 = muladd(c * 4, _x1, u4)
        # Term 23 touches only `u4`; the integer factor is passed directly as the
        # second `muladd` argument instead of being folded into `c`.
        c = coefficients[23] * x4_3
        _y4 = x4
        u4 = muladd(c, 4, u4)
        c = coefficients[24] * x2_1 * x3_2 * x4_2
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 2, _y3, u2)
        u3 = muladd(c * 3 * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[25] * x3_2 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * 3 * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[26] * x3_2 * x4_2
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c, _y3, u2)
        u3 = muladd(c * 3 * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[27] * x3_2 * x4_2
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c, _y3, u1)
        u3 = muladd(c * 3 * _x1, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[28] * x3_2 * x4_2
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c * 3, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[29] * x2_2 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * 3 * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[30] * x1_1 * x2_1 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c * 2, _y2, u1)
        u2 = muladd(c * 2 * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[31] * x2_1 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * 2 * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[32] * x2_1 * x4_2
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 2, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[33] * x1_2 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c * 3, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[34] * x1_1 * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c * 2, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[35] * x4_2
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * _x1, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[36] * x4_2
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c, _y3, u2)
        u3 = muladd(c * _x2, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[37] * x1_2 * x4_2
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c * 3, _y3, u1)
        u3 = muladd(c * _x1, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[38] * x1_1 * x4_2
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c * 2, _y3, u1)
        u3 = muladd(c * _x1, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[39] * x4_2
        _x1 = x1
        _x3 = _x1 * x3
        _y4 = x4
        _y3 = _y4 * x3
        u1 = muladd(c, _y3, u1)
        u3 = muladd(c * _x1, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[40] * x4_2
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c, _y4, u3)
        u4 = muladd(c * 3, _x3, u4)
        c = coefficients[41] * x2_1 * x3_3 * x4_1
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 2, _y3, u2)
        u3 = muladd(c * 4 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        c = coefficients[42] * x3_3 * x4_1
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c, _y3, u2)
        u3 = muladd(c * 4 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        c = coefficients[43] * x3_3 * x4_1
        _x3 = x3
        _y4 = x4
        _y3 = _y4 * x3
        u3 = muladd(c * 4, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        c = coefficients[44] * x2_3 * x3_1 * x4_1
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 4, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        c = coefficients[45] * x2_2 * x3_1 * x4_1
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c, _y2, u1)
        u2 = muladd(c * 3 * _x1, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        c = coefficients[46] * x2_2 * x3_1 * x4_1
        _x2 = x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u2 = muladd(c * 3, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        # Term 47 (last): the explicit parentheses match the default left-to-right
        # grouping of `*`, so the evaluation order is the same as in the other terms.
        c = (coefficients[47] * x1_1 * x2_1 * x3_1) * x4_1
        _x1 = x1
        _x2 = _x1 * x2
        _x3 = _x2 * x3
        _y4 = x4
        _y3 = _y4 * x3
        _y2 = _y3 * x2
        u1 = muladd(c * 2, _y2, u1)
        u2 = muladd(c * 2 * _x1, _y3, u2)
        u3 = muladd(c * 2 * _x2, _y4, u3)
        u4 = muladd(c * 2, _x3, u4)
        # Write the accumulated values into the output vector.
        begin 
            u[1] = u1
            u[2] = u2
            u[3] = u3
            u[4] = u4
        end
        # The mutated output vector is the value of the function.
        u
    end
end

If I time the compile time with

# Timing setup as run by the author on Julia 0.6.1 (`Complex128` and the
# array form `zeros(w)` are spelled as in that version).
# `w` is the 4-element complex input passed as `x`.
w = rand(Complex128, 4)
# `u` is the output buffer that `grad` writes into.
u = zeros(w)
# 47 real coefficients, one per term of `grad`.
coefficients = rand(47)
# Timed on the first call, so the reported time includes compiling `grad`
# for these argument types.
@time grad(u, coefficients, w)

then the result (on 0.6.1) is a whopping 0.77 seconds. In comparison the Float64 variant only needs 0.08 seconds.

The latest nightly (7 days old, so without the constant propagation) is already better. The complex one needs
0.30 seconds and the Float64 variant needs 0.075 seconds.

But I wonder: are there known tricks to “help” the compiler and improve the compile speed? I already tried sprinkling `::T` annotations throughout the code, but these didn’t help, so it seems that inference is not the bottleneck.

Any ideas are appreciated :slight_smile:


#2

Just a random guess but splitting common patterns into small functions might help.


#3

Good idea, but unfortunately this is not really an option here, since the code is auto-generated and, although everything looks similar, there are subtle differences everywhere. Another concern is that the cost of additional function calls is probably too high: for complex arguments the function needs only around 300ns on my system, thanks to SIMD instructions.


#4

The functions will get inlined (or you can force them to be), so the only question is whether separate functions are easier for the Julia compiler to handle. It is worth giving it a try.


#5

I batched the muladd calls together into a new function which accepted 3 tuples of equal size but this only made things worse :frowning: