# Are there tricks to reduce compile-time for this code?

I am exploring the use of generated functions for the evaluation of multivariate polynomials, and all in all I am very pleased with the results I get. The only problem is that for larger polynomials the compile time can become quite significant, since the generated code gets rather long.
Here is one (already truncated) example

# grad(u, coefficients, x): fully unrolled, straight-line kernel (described in the
# surrounding post as auto-generated and truncated).
#
# Reads x[1]..x[4] and coefficients[1]..coefficients[47], accumulates four sums
# u1..u4 starting from zero(T) (T is the element type of `u`), stores them in
# u[1]..u[4] and returns `u`. The previous contents of `u` are never read.
# Judging by the name and the post this is presumably the gradient of a
# 4-variable polynomial -- the listing itself does not state the polynomial.
function grad(u::AbstractVector{T}, coefficients, x) where T
begin
# Power table: xi_k holds x[i]^k, built by repeated multiplication.
x1 = x[1]
x1_1 = x1
x1_2 = x1_1 * x1
x1_3 = x1_2 * x1
x2 = x[2]
x2_1 = x2
x2_2 = x2_1 * x2
x2_3 = x2_2 * x2
x2_4 = x2_3 * x2
x2_5 = x2_4 * x2
x3 = x[3]
x3_1 = x3
x3_2 = x3_1 * x3
x3_3 = x3_2 * x3
x4 = x[4]
x4_1 = x4
x4_2 = x4_1 * x4
x4_3 = x4_2 * x4
x4_4= x4_3 * x4
x4_5 = x4_4 * x4
# Accumulators for the four output components, typed like the elements of `u`.
u1 = zero(T)
u2 = zero(T)
u3 = zero(T)
u4 = zero(T)
# One group of statements per coefficient: `c` is the coefficient times some of the
# cached powers; `_x*` are running products of variables taken left to right and
# `_y*` running products taken right to left. Each update `ui = muladd(a, b, ui)`
# adds a*b to ui. Not every `_x*`/`_y*` assigned in a group is read afterwards.
c = coefficients[1] * x3_1 * x4_5
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 2, _y4, u3)
u4 = muladd(c * 6, _x3, u4)
c = coefficients[2] * x1_1 * x4_5
_x1 = x1
_y4 = x4
u1 = muladd(c * 2, _y4, u1)
u4 = muladd(c * 6, _x1, u4)
c = coefficients[3] * x3_2 * x4_4
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 3, _y4, u3)
u4 = muladd(c * 5, _x3, u4)
c = coefficients[4] * x4_4
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 5, _x3, u4)
c = coefficients[5] * x1_1 * x4_4
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u1 = muladd(c * 2, _y3, u1)
u3 = muladd(c * _x1, _y4, u3)
u4 = muladd(c * 5, _x3, u4)
c = coefficients[6] * x4_4
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * _x1, _y4, u3)
u4 = muladd(c * 5, _x3, u4)
c = coefficients[7] * x4_4
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u4 = muladd(c * 5, _x3, u4)
c = coefficients[8] * x3_3 * x4_3
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 4, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[9] * x2_1 * x3_1 * x4_3
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 2, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[10] * x3_1 * x4_3
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[11] * x3_1 * x4_3
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[12] * x1_1 * x3_1 * x4_3
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u1 = muladd(c * 2, _y3, u1)
u3 = muladd(c * 2 * _x1, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[13] * x3_1 * x4_3
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 2 * _x1, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[14] * x3_1 * x4_3
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 2, _y4, u3)
u4 = muladd(c * 4, _x3, u4)
c = coefficients[15] * x1_1 * x2_1 * x4_3
_x1 = x1
_x2 = _x1 * x2
_y4 = x4
_y2 = _y4 * x2
u1 = muladd(c * 2, _y2, u1)
u2 = muladd(c * 2 * _x1, _y4, u2)
u4 = muladd(c * 4, _x2, u4)
c = coefficients[16] * x1_2 * x4_3
_x1 = x1
_x2 = _x1 * x2
_y4 = x4
_y2 = _y4 * x2
u1 = muladd(c * 3, _y2, u1)
u2 = muladd(c * _x1, _y4, u2)
u4 = muladd(c * 4, _x2, u4)
c = coefficients[17] * x1_1 * x4_3
_x1 = x1
_x2 = _x1 * x2
_y4 = x4
_y2 = _y4 * x2
u1 = muladd(c * 2, _y2, u1)
u2 = muladd(c * _x1, _y4, u2)
u4 = muladd(c * 4, _x2, u4)
c = coefficients[18] * x4_3
_x1 = x1
_x2 = _x1 * x2
_y4 = x4
_y2 = _y4 * x2
u2 = muladd(c * _x1, _y4, u2)
u4 = muladd(c * 4, _x2, u4)
c = coefficients[19] * x1_3 * x4_3
_x1 = x1
_y4 = x4
u1 = muladd(c * 4, _y4, u1)
u4 = muladd(c * 4, _x1, u4)
c = coefficients[20] * x1_2 * x4_3
_x1 = x1
_y4 = x4
u1 = muladd(c * 3, _y4, u1)
u4 = muladd(c * 4, _x1, u4)
c = coefficients[21] * x1_1 * x4_3
_x1 = x1
_y4 = x4
u1 = muladd(c * 2, _y4, u1)
u4 = muladd(c * 4, _x1, u4)
c = coefficients[22] * x4_3
_x1 = x1
_y4 = x4
u4 = muladd(c * 4, _x1, u4)
# NOTE(review): the `c` and `_y4` assigned in this group are overwritten by the
# next group before any muladd reads them, so coefficients[23] does not affect
# u1..u4 -- confirm whether that is intended or an artifact of the truncation.
c = coefficients[23] * x4_3
_y4 = x4
c = coefficients[24] * x2_1 * x3_2 * x4_2
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 2, _y3, u2)
u3 = muladd(c * 3 * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[25] * x3_2 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * 3 * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[26] * x3_2 * x4_2
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u3 = muladd(c * 3 * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[27] * x3_2 * x4_2
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 3 * _x1, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[28] * x3_2 * x4_2
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 3, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[29] * x2_2 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 3 * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[30] * x1_1 * x2_1 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u1 = muladd(c * 2, _y2, u1)
u2 = muladd(c * 2 * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[31] * x2_1 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 2 * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[32] * x2_1 * x4_2
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 2, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[33] * x1_2 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u1 = muladd(c * 3, _y2, u1)
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[34] * x1_1 * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u1 = muladd(c * 2, _y2, u1)
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[35] * x4_2
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * _x1, _y3, u2)
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[36] * x4_2
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u3 = muladd(c * _x2, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[37] * x1_2 * x4_2
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u1 = muladd(c * 3, _y3, u1)
u3 = muladd(c * _x1, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[38] * x1_1 * x4_2
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u1 = muladd(c * 2, _y3, u1)
u3 = muladd(c * _x1, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[39] * x4_2
_x1 = x1
_x3 = _x1 * x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * _x1, _y4, u3)
u4 = muladd(c * 3, _x3, u4)
c = coefficients[40] * x4_2
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u4 = muladd(c * 3, _x3, u4)
c = coefficients[41] * x2_1 * x3_3 * x4_1
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 2, _y3, u2)
u3 = muladd(c * 4 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = coefficients[42] * x3_3 * x4_1
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u3 = muladd(c * 4 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = coefficients[43] * x3_3 * x4_1
_x3 = x3
_y4 = x4
_y3 = _y4 * x3
u3 = muladd(c * 4, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = coefficients[44] * x2_3 * x3_1 * x4_1
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 4, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = coefficients[45] * x2_2 * x3_1 * x4_1
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 3 * _x1, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = coefficients[46] * x2_2 * x3_1 * x4_1
_x2 = x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u2 = muladd(c * 3, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
c = (coefficients[47] * x1_1 * x2_1 * x3_1) * x4_1
_x1 = x1
_x2 = _x1 * x2
_x3 = _x2 * x3
_y4 = x4
_y3 = _y4 * x3
_y2 = _y3 * x2
u1 = muladd(c * 2, _y2, u1)
u2 = muladd(c * 2 * _x1, _y3, u2)
u3 = muladd(c * 2 * _x2, _y4, u3)
u4 = muladd(c * 2, _x3, u4)
# Store the accumulated sums into the caller-supplied output vector.
begin
u[1] = u1
u[2] = u2
u[3] = u3
u[4] = u4
end
# The mutated `u` is the return value.
u
end
end

If I time the compile time with

# Benchmark inputs: a random complex evaluation point, a zero-filled output buffer
# with the same element type and shape, and 47 random real coefficients (`grad`
# reads coefficients[1] through coefficients[47]).
# `Complex{Float64}` and `zeros(eltype(w), size(w))` are spelled out because the
# `Complex128` alias and the `zeros(w)` array method were deprecated in Julia 0.7
# and removed in 1.0; these forms produce the same values on 0.6 as well.
w = rand(Complex{Float64}, 4)
u = zeros(eltype(w), size(w))
coefficients = rand(47)

then the result (on 0.6.1) is a whopping 0.77 seconds. In comparison the Float64 variant only needs 0.08 seconds.

The latest nightly (7 days old, so without the constant propagation) is already better. The complex one needs
0.30 seconds and the Float64 variant needs 0.075 seconds.

But I wonder, are there known tricks to "help" the compiler in order to improve the compile speed? I already tried to sprinkle ::T statements, but it seems that the problem is not the inference part, since these didn't help.

Any ideas are appreciated

Just a random guess but splitting common patterns into small functions might help.

Good idea, but unfortunately this is not really an option here, since the code is auto-generated and, although everything looks similar, there are subtle differences everywhere. Another thing is that the cost of additional function calls is probably too big. For complex arguments the function needs on my system only around 300ns thanks to SIMD instructions.

The functions will get inlined (or you can force them to be), so the only question is whether separate functions are easier for the Julia compiler. It is worth giving it a try.

1 Like

I batched the muladd calls together into a new function which accepted 3 tuples of equal size, but this only made things worse.