# Speed up `x::Bool ^ y::Float64`

Hello, I am trying to speed up `x::Bool ^ y::Float64`, and I have even made some progress.

Is there any way to avoid the branching in `f2()` in the example below?

How else could this be sped up?

``````using BenchmarkTools, Test

# simple
"""
    f1(x::Bool, y::Float64)

Baseline: raise the boolean `x` to the power `y` with the built-in `^` operator.
"""
function f1(x::Bool, y::Float64)
    return x^y
end;

# faster
"""
    f2(x::Bool, y::Float64)

Hand-written `x ^ y` for a boolean base, resolved by case analysis instead of a
call to the general power routine.

Returns `1.0` when `x` is `true` or `y` is zero, `NaN` when `x` is `false` and `y`
is `NaN`, `0.0` for `false` with a positive `y`, and `Inf` for `false` with a
negative `y`.
"""
function f2(x::Bool, y::Float64)
    # true ^ any_float64 -> 1.0 (checked first, so this also covers a NaN exponent)
    x && return 1.0
    # false ^ NaN -> NaN
    isnan(y) && return NaN
    # false ^ positive -> 0.0
    y > 0 && return 0.0
    # false ^ negative -> Inf; the only exponent left after that is zero -> 1.0
    return y < 0 ? Inf : 1.0
end;

# test data: random boolean bases and random exponents, with special values mixed in
n = 10000;
r = zeros(n);  # output buffer for the benchmark runs (Float64 by default)
x = rand(Bool, n);
y = randn(n) .^ 111;  # the 111th power of normal draws
# Overwrite 100 randomly chosen slots (indices may repeat) with each special value in turn.
for special in (NaN, -Inf, Inf, nextfloat(-Inf), prevfloat(Inf), 0.0, -0.0)
    y[rand(1:n, 100)] .= special
end

# test: the fast version must return exactly what the built-in power returns (`===`)
@testset begin
    for i in 1:n
        xi, yi = x[i], y[i]
        @test f1(xi, yi) === f2(xi, yi)
    end
end;

# benchmarks
"""
    f!(f, r, x, y, n)

Benchmark driver: store `f(x[i], y[i])` in `r[i]` for each `i` from 1 to `n`.
Mutates `r` and returns `nothing`.
"""
function f!(f, r, x, y, n)
    for i in 1:n
        r[i] = f(x[i], y[i])
    end
end;

# `$` interpolates the current values into the benchmark expression, so the timing
# measures the call itself rather than access to non-constant globals.
@btime f!($f1,$r,$x,$y,$n)  # -> 243.600 μs (0 allocations: 0 bytes)
@btime f!($f2,$r,$x,$y,$n)  # -> 5.783 μs (0 allocations: 0 bytes)
``````

You can write this as

``````function f3(x::Bool, y::T) where T<:AbstractFloat
    # Branch-free `x ^ y` for a boolean base: both candidate results are computed
    # and `ifelse` picks one of them.
    # `abs(y) * T(Inf)` is Inf for any nonzero finite or infinite y and NaN for a NaN y.
    magnitude = abs(y) * T(Inf)
    # `!(y > 0)` is true for negative y and for NaN (every comparison with NaN is false).
    keep = !(y > 0)
    # Multiplying by `false` gives zero and by `true` leaves the value unchanged,
    # so a positive y yields zero while a negative or NaN y keeps Inf or NaN.
    otherwise = magnitude * keep
    # A true base or a zero exponent gives one.
    return ifelse(x | iszero(y), one(T), otherwise)
end
``````

which on my computer benchmarks a bit faster.

4 Likes

cool! thank you

If you find a faster way, please make a PR to base for the `Float64^Float64` algorithm.

What is the magic of `(!(y>0))`?
If I use `(y<=0)` instead, the tests fail.

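`NaN>0` and `NaN<0` are both false. So `!(y>0)` is `true` for `NaN`, whereas `y<=0` is `false` and would turn the `NaN` result into `0.0`.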

1 Like

Out of curiosity, is the purpose of `abs(y)` just to propagate `NaN`, or is there another reason for it?

Yeah, that was just the cheapest way I could find to propagate `NaN` without screwing up anything else.

Oscar, I found:

``````function pow_fast(x::Bool, y::T) where T <: AbstractFloat
    # `true ^ y` is one for every exponent, NaN included.
    x && return one(T)
    # `false ^ y`, built innermost-first from `ifelse` selections:
    # the sign bit picks Inf or zero, a NaN exponent overrides that,
    # and a zero exponent (either sign) overrides both.
    by_sign = ifelse(signbit(y), T(Inf), zero(T))
    nan_or_sign = ifelse(isnan(y), T(NaN), by_sign)
    return ifelse(iszero(y), one(T), nan_or_sign)
end;
function pow_fast(x::Bool, y::BigFloat)
    # BigFloat variant: select among Float64 constants, then widen once with `big`.
    x && return big(1.0)
    by_sign = ifelse(signbit(y), Inf, 0.0)
    nan_or_sign = ifelse(isnan(y), NaN, by_sign)
    return big(ifelse(iszero(y), 1.0, nan_or_sign))
end;

function pow_fast(x::T, y::Bool) where T <: AbstractFloat
    # x ^ true is x itself; x ^ false is one.
    y && return x
    return one(T)
end;
function pow_fast(x::BigFloat, y::Bool)
    y && return x
    # only big(1.0) here: not one(x), not BigFloat(1), not one(T)
    return big(1.0)
end;
``````

benchmarks, tests:

``````module M
# Self-contained check-and-benchmark script for a fast `Bool ^ AbstractFloat`.
# It prints the CPU model and Julia version, verifies `pow_fast` against the
# built-in `^` for every subtype of `AbstractFloat`, then benchmarks both.
# The test and benchmark output is bracketed by printed ``` lines.
using BenchmarkTools, Test, InteractiveUtils

println("**", rstrip(Sys.cpu_info()[1].model), "**")
println("**Julia-",VERSION,"**\n")

println("```")

# Reference implementation: defer to the built-in `^`.
pow_native(x::X, y::Y) where {X,Y} = x ^ y

# Fast `Bool ^ float`. A true base always gives one. For a false base the result
# is chosen with nested `ifelse`: one for a zero exponent, NaN for a NaN exponent,
# Inf when the sign bit is set, zero otherwise.
pow_fast(x::Bool, y::T) where T <: AbstractFloat =
    x ? one(T) : ifelse(iszero(y), one(T), ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), zero(T))));

# BigFloat variant: select among Float64 constants, then widen with `big`.
pow_fast(x::Bool, y::BigFloat) =
    x ? big(1.0) : big(ifelse(iszero(y), 1.0 , ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))));

# test data
"""
    get_test_data1(::Type{T}) where T <: AbstractFloat

Build random inputs for testing and benchmarking `Bool ^ T`.

Returns `(n, r, x, y)`: the element count `n`, a zeroed result buffer `r` of type
`T`, random `Bool` bases `x`, and exponents `y` of type `T` made of normal draws
raised to the 111th power, with special values written at random indices.
"""
function get_test_data1(::Type{T}) where T <: AbstractFloat
    n = 10000
    m = n ÷ 100  # for special values of the same type NaN, -Inf ...
    r = zeros(T, n)  # for result
    x = rand(Bool, n)
    # uncomment one of these to use all-true or all-false bases instead
    #x = ones(Bool, n)
    #x = zeros(Bool, n)
    # BigFloat draws are generated as Float64 and then widened
    y = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
    y[rand(1:n, m)] .= T(NaN)
    y[rand(1:n, m)] .= -T(Inf)
    y[rand(1:n, m)] .= T(Inf)
    y[rand(1:n, m)] .= nextfloat(-T(Inf))
    y[rand(1:n, m)] .= prevfloat(T(Inf))
    y[rand(1:n, m)] .= T(0)
    y[rand(1:n, m)] .= -T(0)
    return n, r, x, y
end

# tests
@testset verbose = true "Tests" begin
    @testset verbose = false "Fast `bool ^ float`" begin
        for Flt in subtypes(AbstractFloat)
            @testset verbose = true " $Flt" begin
                n, r, x, y = get_test_data1(Flt)
                @testset "$pow_fast" begin
                    for i = 1 : n
                        r_native = pow_native(x[i],y[i])
                        r_fast = pow_fast(x[i],y[i])
                        # `===` cannot be used here: big(1.0) !== big(1.0).
                        # `==` alone is not enough either, since NaN != NaN,
                        # so two NaN results are accepted explicitly.
                        @test r_native == r_fast || (isnan(r_native) && isnan(r_fast))
                    end
                end
            end
        end
    end
end;

# benchmarks
# func: store f(x[i], y[i]) in r[i] for i = 1..n
f!(f,r,x,y,n) = for i = 1 : n
    r[i] = f(x[i], y[i])
end;

println("\nFast `bool ^ float` benchmark (1 - native 2 - fast):")

for Flt in subtypes(AbstractFloat)
    n, r, x, y = get_test_data1(Flt)
    println(" $Flt:")
    # `$` interpolates the values into the benchmark expression
    @btime f!($pow_native,$r,$x,$y,$n)
    @btime f!($pow_fast,$r,$x,$y,$n)
end

println("```")

end;
``````

AMD Ryzen 9 3900XT 12-Core Processor
Julia-1.8.0

``````Test Summary:         |  Pass  Total  Time
Tests                 | 40000  40000  1.3s
Fast `bool ^ float` | 40000  40000  1.3s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.147 ms (40000 allocations: 1.98 MiB)
770.800 μs (20000 allocations: 1015.62 KiB)
Float16:
203.600 μs (0 allocations: 0 bytes)
13.200 μs (0 allocations: 0 bytes)
Float32:
262.100 μs (0 allocations: 0 bytes)
1.130 μs (0 allocations: 0 bytes)
Float64:
247.500 μs (0 allocations: 0 bytes)
2.111 μs (0 allocations: 0 bytes)
``````