Performance of power as function input

Hi,

I have a performance regression when including the power of a function as an input argument. I have boiled it down to something like this:

function f1(X)
    n,d = size(X)
    dist = 0.0

    for i = 2:n
        for j = 1:i-1
            dist_tmp = 0.0
            for k = 1:d
                @inbounds dist_comp = X[i,k]-X[j,k]
                dist_tmp += dist_comp^2
            end
            dist += 1/dist_tmp
        end
    end
    output = 1/dist
    return output
end

function f2(X,ae_power)
    n,d = size(X)
    dist = 0.0

    for i = 2:n
        for j = 1:i-1
            dist_tmp = 0.0
            for k = 1:d
                @inbounds dist_comp = X[i,k]-X[j,k]
                dist_tmp += dist_comp^ae_power
            end
            dist += 1/dist_tmp
        end
    end
    output = 1/dist
    return output
end

function f3(X,::Val{ae_power}) where ae_power
    n,d = size(X)
    dist = 0.0

    for i = 2:n
        for j = 1:i-1
            dist_tmp = 0.0
            for k = 1:d
                @inbounds dist_comp = X[i,k]-X[j,k]
                dist_tmp += dist_comp^ae_power
            end
            dist += 1/dist_tmp
        end
    end
    output = 1/dist
    return output
end

X=rand(1:500,500,500);
@btime f1(X)
@btime f2(X,2)
@btime f3(X,Val(2))

  102.178 ms (1 allocation: 16 bytes)
  181.348 ms (1 allocation: 16 bytes)
  175.748 ms (1 allocation: 16 bytes)

Using Val gives me a small improvement but not close enough to the original implementation.
Strangely @code_warntype only shows an output for function f1.

Could someone point me in a direction to improve this? I am using Julia 1.3.1 built from source.

1 Like

If you used a literal, the code a ^ b is lowered to call Base.literal_pow(^, a, Val(b)). You can do this manually with your ::Val{ae_power} version.

julia> Meta.@lower a ^ b
:($(Expr(:thunk, CodeInfo(
    @ none within `top-level scope'
1 ─ %1 = a ^ b
└──      return %1
))))

julia> Meta.@lower a ^ 3
:($(Expr(:thunk, CodeInfo(
    @ none within `top-level scope'
1 ─ %1 = Core.apply_type(Base.Val, 3)
│   %2 = (%1)()
│   %3 = Base.literal_pow(^, a, %2)
└──      return %3
))))
2 Likes

Literal exponents get lowered to the special Base.literal_pow, which can be more efficient than the generic ^:

julia> f(x) = x^2
f (generic function with 1 method)

julia> @code_lowered(f(1))
CodeInfo(
1 ─ %1 = Core.apply_type(Base.Val, 2)
│   %2 = (%1)()
│   %3 = Base.literal_pow(Main.:^, x, %2)
└──      return %3
)

You should be able to get the same performance by calling literal_pow yourself:

dist_tmp += Base.literal_pow(^, dist_comp, Val(ae_power))

Edit: whoops, @Elrod beat me to it :wink:

2 Likes

Thanks, this solves my problem!