Very Mysterious Memory Allocation

question

#1

I had a similar issue couple of years and looks like its back. See the MWE function below:

function getdW{T<:AbstractFloat}(k::T,work::AbstractVector{T})
     local sqrt2l = 1.41421356237309510
     local K2c0 = 0.7542472332656508
     local K2c1 = -0.2
     local K2c2 = 0.08081220356417687
     local K2c3 = -0.031746031746031744
     local K2c4 = 0.012244273267299524
     local K2c5 = -2.0/429.0
     local K2c6 = 8.0 * 1.41421356237309510/6435.0
     local K2c7 = -8.0/12155.0
     local K2c8 = 8.0*1.41421356237309510/46189.0
     local k00  = 0.3535533905932738
     local k02  = 3.00*k00*0.25
     local k03  = -2.00/3.0
     local k04  = 15.0*1.41421356237309510/128.0
     local k05  = -2.0/5.0
     local k06  = 35.0*1.41421356237309510/512.0
     local k07  = -8/35.0
     local k08  = 315.0*1.41421356237309510/8192.0
     local sqrt2by4 = sqrt2l*0.25
     local dk01 = 3.00*sqrt2by4/2.00
     local dk02 = -2.0
     local dk03 = 15.00*sqrt2l/32.0
     local dk04 = -2.0

     local dK2c0 = -1.0/5.0
     local dK2c1 = 4.0*sqrt2l/35.0
     local dK2c2 = -2.0/21.0
     local dK2c3 = 8.0*sqrt2l/231.0
     local dK2c4 = -10.0/429.0
     local dK2c5 = 16.0*sqrt2l/2145.0
     local dK2c6 = -56.0/12155.0
     local dK2c7 =	64.0*sqrt2l/46189.0

    W::T = 0.0
    dW::T = 0.0
    t2::T = k*k
    m = (2.0-t2)
    bym = 1.0/m

    if k<=-1E-3
      t3 = acos(t2 - 1.0)
      t6 = 2.0*Ď€ - t3
      W	=	(t6 * sqrt(bym) - k)*bym
      dW = (-2.0+3.0*W*k)*bym
    elseif k >= 1.4342135623730952
      t7 = (k+1.0) * (k-1.00)
      t3 = log(t7 + sqrt(t7*t7-1.0))
      t4 =  t7-1.0
      t5 =  1.0/t4
      W	= (-t3 * sqrt(t5) + k)*t5
      dW =  (-2.0+3.0*W*k)*bym
  elseif k>=1E-3 && k < 1.3942135623730951::T
        t3 = acos(t2 - 1.0)
        t6 = t3
        W  = (t6 * sqrt(bym) - k)*bym
        dW = (-2.0+3.0*W*k)*bym
    elseif k<1E-3 # series for k ~= 0 improve convergence
      xt = π
      t3 = t2*k
      t4 = t2*t2
      t5 = t4*k
      t6 = t3*t3
      t7 = t5*t2
      t8 = t4*t4
      W  =	 xt*k00 - k + (xt*k02)*t2 + k03*t3 + (xt*k04)*t4 + k05*t5 + (xt*k06)*t6  + k07*t7 + (xt*k08)*t8
      dW  =	 -1.00 + xt*dk01*k +dk02*t2      + xt*dk03*t3 + dk04*t4
    elseif  k < 1.4342135623730952
      xt ::T = k-1.41421356237309510
      t2 = xt*xt
      t3 = t2*xt
      t4 = t3*xt
      t5 = t4*xt
      t6 = t5*xt
      t7 = t6*xt
      t8 = t7*xt
      W = K2c0 + K2c1*k + K2c2*t2 + K2c3*t3 + K2c4*t4 + K2c5*t5 + K2c6*t6 + K2c7*t7 + K2c8*t8
      dW = dK2c0 + dK2c1*xt+ dK2c2*t2 + dK2c3*t3 + dK2c4*t4 + dK2c5*t5 + dK2c6*t6 + dK2c7*t7
    end
    work[1] = W::T
    work[2] = dW::T
    return nothing
end

now look at the following calls:

const work = Vector{Float64}(2)

julia> @btime getdW(-1.2,work)
  29.962 ns (0 allocations: 0 bytes)

julia> @btime getdW(1.2,work)
  30.806 ns (0 allocations: 0 bytes)

julia> @btime getdW(1.414,work)
  86.559 ns (14 allocations: 224 bytes)

julia> @btime getdW(0.0001,work)
  47.341 ns (7 allocations: 112 bytes)

The function seems to allocate memory based on which branch it is in !
This makes no sense to me as the @code_warntype shows no type instabilities and the branches are statically types. I opened a similar issue an year or so ago on Julia google forums and it seemed to be fixed but the problem has cropped up again more severely this time.
Does anyone know whats going on ? I plan to call this function millions of times and not sure how to get around this weird performance issue.

I am on Julia 0.6.

Thanks!


#2

Upon further investigation, looks like the memory allocation happens if the code executes either of the last two elseif blocks. They dont seem to be allocating any memory so I am not sure know whats happening.


#3

The lines are too long so you’re hitting the splat penalty. Add some parentheses.


#4

Oh, I wasn’t even aware of such a penalty. Let me try that.

thanks


#5

Awesome… that did it. Thanks for this tip.
Is this penalty documented somewhere ?


#6

Not sure. But I hope it actually just gets “fixed” soon enough that it doesn’t matter. Right now the cutoff is 16 operations, which is a little too easy to accidentally run into. Issue which links to a bunch of other issues + PR:

https://github.com/JuliaLang/julia/issues/22370
https://github.com/JuliaLang/julia/pull/22545

The PR won’t actually go anywhere until someone who knows the internals well takes it up, but I hope it gets something started.


#7

There was a PR to document it at https://github.com/JuliaLang/julia/pull/15211 but it got reverted due to some problems with it and the corrections never made it back in.