I am exploring the use of generated functions for evaluating multivariate polynomials, and all in all I am very pleased with the results. The only problem is that for larger polynomials the compile time can become quite significant, since the generated code gets rather long.
Here is one (already truncated) example:
"""
    grad(u, coefficients, x)

Evaluate the gradient of a fixed 47-term polynomial in the four variables
`x[1]`, `x[2]`, `x[3]`, `x[4]` and store it in `u[1]`, ..., `u[4]`.

Term `k` is `coefficients[k]` times a monomial in `x[1:4]`. For each term the
common factor `c` (the coefficient times the monomial with every exponent
lowered by one) is computed once and then combined with the products of the
remaining variables to obtain the partial derivatives, which are accumulated
with `muladd`.

# Arguments
- `u::AbstractVector{T}`: output buffer; entries 1 to 4 are overwritten.
- `coefficients`: indexable collection; entries 1 to 47 are read.
- `x`: indexable collection; entries 1 to 4 are read.

# Returns
The mutated `u`.
"""
function grad(u::AbstractVector{T}, coefficients, x) where T
    # Powers of each variable that appear in at least one term (`xi_k` is x[i]^k).
    x1 = x[1]
    x1_2 = x1 * x1
    x1_3 = x1_2 * x1
    x2 = x[2]
    x2_2 = x2 * x2
    x2_3 = x2_2 * x2
    x3 = x[3]
    x3_2 = x3 * x3
    x3_3 = x3_2 * x3
    x4 = x[4]
    x4_2 = x4 * x4
    x4_3 = x4_2 * x4
    x4_4 = x4_3 * x4
    x4_5 = x4_4 * x4

    # Cross products shared by many terms, computed once. The digits give the
    # multiplication order, e.g. p432 == (x4 * x3) * x2.
    p12 = x1 * x2
    p123 = p12 * x3
    p13 = x1 * x3
    p23 = x2 * x3
    p43 = x4 * x3
    p432 = p43 * x2
    p42 = x4 * x2

    # Gradient accumulators.
    u1 = zero(T)
    u2 = zero(T)
    u3 = zero(T)
    u4 = zero(T)

    # term 1
    c = coefficients[1] * x3 * x4_5
    u3 = muladd(c * 2, x4, u3)
    u4 = muladd(c * 6, x3, u4)
    # term 2
    c = coefficients[2] * x1 * x4_5
    u1 = muladd(c * 2, x4, u1)
    u4 = muladd(c * 6, x1, u4)
    # term 3
    c = coefficients[3] * x3_2 * x4_4
    u3 = muladd(c * 3, x4, u3)
    u4 = muladd(c * 5, x3, u4)
    # term 4
    c = coefficients[4] * x4_4
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 5, p123, u4)
    # term 5
    c = coefficients[5] * x1 * x4_4
    u1 = muladd(c * 2, p43, u1)
    u3 = muladd(c * x1, x4, u3)
    u4 = muladd(c * 5, p13, u4)
    # term 6
    c = coefficients[6] * x4_4
    u1 = muladd(c, p43, u1)
    u3 = muladd(c * x1, x4, u3)
    u4 = muladd(c * 5, p13, u4)
    # term 7
    c = coefficients[7] * x4_4
    u3 = muladd(c, x4, u3)
    u4 = muladd(c * 5, x3, u4)
    # term 8
    c = coefficients[8] * x3_3 * x4_3
    u3 = muladd(c * 4, x4, u3)
    u4 = muladd(c * 4, x3, u4)
    # term 9
    c = coefficients[9] * x2 * x3 * x4_3
    u2 = muladd(c * 2, p43, u2)
    u3 = muladd(c * 2 * x2, x4, u3)
    u4 = muladd(c * 4, p23, u4)
    # term 10
    c = coefficients[10] * x3 * x4_3
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * 2 * p12, x4, u3)
    u4 = muladd(c * 4, p123, u4)
    # term 11
    c = coefficients[11] * x3 * x4_3
    u2 = muladd(c, p43, u2)
    u3 = muladd(c * 2 * x2, x4, u3)
    u4 = muladd(c * 4, p23, u4)
    # term 12
    c = coefficients[12] * x1 * x3 * x4_3
    u1 = muladd(c * 2, p43, u1)
    u3 = muladd(c * 2 * x1, x4, u3)
    u4 = muladd(c * 4, p13, u4)
    # term 13
    c = coefficients[13] * x3 * x4_3
    u1 = muladd(c, p43, u1)
    u3 = muladd(c * 2 * x1, x4, u3)
    u4 = muladd(c * 4, p13, u4)
    # term 14
    c = coefficients[14] * x3 * x4_3
    u3 = muladd(c * 2, x4, u3)
    u4 = muladd(c * 4, x3, u4)
    # term 15
    c = coefficients[15] * x1 * x2 * x4_3
    u1 = muladd(c * 2, p42, u1)
    u2 = muladd(c * 2 * x1, x4, u2)
    u4 = muladd(c * 4, p12, u4)
    # term 16
    c = coefficients[16] * x1_2 * x4_3
    u1 = muladd(c * 3, p42, u1)
    u2 = muladd(c * x1, x4, u2)
    u4 = muladd(c * 4, p12, u4)
    # term 17
    c = coefficients[17] * x1 * x4_3
    u1 = muladd(c * 2, p42, u1)
    u2 = muladd(c * x1, x4, u2)
    u4 = muladd(c * 4, p12, u4)
    # term 18
    c = coefficients[18] * x4_3
    u1 = muladd(c, p42, u1)
    u2 = muladd(c * x1, x4, u2)
    u4 = muladd(c * 4, p12, u4)
    # term 19
    c = coefficients[19] * x1_3 * x4_3
    u1 = muladd(c * 4, x4, u1)
    u4 = muladd(c * 4, x1, u4)
    # term 20
    c = coefficients[20] * x1_2 * x4_3
    u1 = muladd(c * 3, x4, u1)
    u4 = muladd(c * 4, x1, u4)
    # term 21
    c = coefficients[21] * x1 * x4_3
    u1 = muladd(c * 2, x4, u1)
    u4 = muladd(c * 4, x1, u4)
    # term 22
    c = coefficients[22] * x4_3
    u1 = muladd(c, x4, u1)
    u4 = muladd(c * 4, x1, u4)
    # term 23 (depends on x[4] only)
    c = coefficients[23] * x4_3
    u4 = muladd(c, 4, u4)
    # term 24
    c = coefficients[24] * x2 * x3_2 * x4_2
    u2 = muladd(c * 2, p43, u2)
    u3 = muladd(c * 3 * x2, x4, u3)
    u4 = muladd(c * 3, p23, u4)
    # term 25
    c = coefficients[25] * x3_2 * x4_2
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * 3 * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 26
    c = coefficients[26] * x3_2 * x4_2
    u2 = muladd(c, p43, u2)
    u3 = muladd(c * 3 * x2, x4, u3)
    u4 = muladd(c * 3, p23, u4)
    # term 27
    c = coefficients[27] * x3_2 * x4_2
    u1 = muladd(c, p43, u1)
    u3 = muladd(c * 3 * x1, x4, u3)
    u4 = muladd(c * 3, p13, u4)
    # term 28
    c = coefficients[28] * x3_2 * x4_2
    u3 = muladd(c * 3, x4, u3)
    u4 = muladd(c * 3, x3, u4)
    # term 29
    c = coefficients[29] * x2_2 * x4_2
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * 3 * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 30
    c = coefficients[30] * x1 * x2 * x4_2
    u1 = muladd(c * 2, p432, u1)
    u2 = muladd(c * 2 * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 31
    c = coefficients[31] * x2 * x4_2
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * 2 * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 32
    c = coefficients[32] * x2 * x4_2
    u2 = muladd(c * 2, p43, u2)
    u3 = muladd(c * x2, x4, u3)
    u4 = muladd(c * 3, p23, u4)
    # term 33
    c = coefficients[33] * x1_2 * x4_2
    u1 = muladd(c * 3, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 34
    c = coefficients[34] * x1 * x4_2
    u1 = muladd(c * 2, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 35
    c = coefficients[35] * x4_2
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * x1, p43, u2)
    u3 = muladd(c * p12, x4, u3)
    u4 = muladd(c * 3, p123, u4)
    # term 36
    c = coefficients[36] * x4_2
    u2 = muladd(c, p43, u2)
    u3 = muladd(c * x2, x4, u3)
    u4 = muladd(c * 3, p23, u4)
    # term 37
    c = coefficients[37] * x1_2 * x4_2
    u1 = muladd(c * 3, p43, u1)
    u3 = muladd(c * x1, x4, u3)
    u4 = muladd(c * 3, p13, u4)
    # term 38
    c = coefficients[38] * x1 * x4_2
    u1 = muladd(c * 2, p43, u1)
    u3 = muladd(c * x1, x4, u3)
    u4 = muladd(c * 3, p13, u4)
    # term 39
    c = coefficients[39] * x4_2
    u1 = muladd(c, p43, u1)
    u3 = muladd(c * x1, x4, u3)
    u4 = muladd(c * 3, p13, u4)
    # term 40
    c = coefficients[40] * x4_2
    u3 = muladd(c, x4, u3)
    u4 = muladd(c * 3, x3, u4)
    # term 41
    c = coefficients[41] * x2 * x3_3 * x4
    u2 = muladd(c * 2, p43, u2)
    u3 = muladd(c * 4 * x2, x4, u3)
    u4 = muladd(c * 2, p23, u4)
    # term 42
    c = coefficients[42] * x3_3 * x4
    u2 = muladd(c, p43, u2)
    u3 = muladd(c * 4 * x2, x4, u3)
    u4 = muladd(c * 2, p23, u4)
    # term 43
    c = coefficients[43] * x3_3 * x4
    u3 = muladd(c * 4, x4, u3)
    u4 = muladd(c * 2, x3, u4)
    # term 44
    c = coefficients[44] * x2_3 * x3 * x4
    u2 = muladd(c * 4, p43, u2)
    u3 = muladd(c * 2 * x2, x4, u3)
    u4 = muladd(c * 2, p23, u4)
    # term 45
    c = coefficients[45] * x2_2 * x3 * x4
    u1 = muladd(c, p432, u1)
    u2 = muladd(c * 3 * x1, p43, u2)
    u3 = muladd(c * 2 * p12, x4, u3)
    u4 = muladd(c * 2, p123, u4)
    # term 46
    c = coefficients[46] * x2_2 * x3 * x4
    u2 = muladd(c * 3, p43, u2)
    u3 = muladd(c * 2 * x2, x4, u3)
    u4 = muladd(c * 2, p23, u4)
    # term 47 (parenthesised exactly as in the generated code)
    c = (coefficients[47] * x1 * x2 * x3) * x4
    u1 = muladd(c * 2, p432, u1)
    u2 = muladd(c * 2 * x1, p43, u2)
    u3 = muladd(c * 2 * p12, x4, u3)
    u4 = muladd(c * 2, p123, u4)

    # Write the accumulated partial derivatives into the output buffer.
    u[1] = u1
    u[2] = u2
    u[3] = u3
    u[4] = u4
    return u
end
If I measure the compile time with
# Random complex input vector with four entries.
w = rand(Complex128, 4)
# Output buffer derived from `w` (presumably same shape and element type under Julia 0.6 `zeros(A)` semantics; confirm).
u = zeros(w)
# 47 random coefficients, one per term of `grad`.
coefficients = rand(47)
# Time the call to `grad`; the surrounding text reports this as the compile time.
@time grad(u, coefficients, w)
then the result (on 0.6.1) is a whopping 0.77 seconds. In comparison, the `Float64` variant only needs 0.08 seconds.
The latest nightly (7 days old, so without the constant propagation) is already better. The complex one needs 0.30 seconds and the `Float64` variant needs 0.075 seconds.
But I wonder: are there known tricks to “help” the compiler and reduce the compile time? I already tried sprinkling `::T` annotations throughout the code, but these didn’t help, so it seems that inference is not the bottleneck.
Any ideas are appreciated.