Speed up `x::Bool ^ y::Float64`

Hello, I am trying to speed up `x::Bool ^ y::Float64`, and I have already made some progress.

Is there any way to avoid the branching in `f2()` in the example below?

How else could this be sped up?

using BenchmarkTools, Test

# simple
function f1(x::Bool, y::Float64)
    # Reference implementation: simply defer to the built-in `^` operator.
    return x^y
end;

# faster
function f2(x::Bool, y::Float64)
    # Hand-written case analysis of `x ^ y` for a Bool base; always returns a Float64.
    x && return 1.0          # true ^ any_float64 -> 1.0
    # From here on the base is `false`, i.e. zero.
    isnan(y) && return NaN   # false ^ NaN -> NaN
    y > 0 && return 0.0      # zero to a positive power
    y < 0 && return Inf      # zero to a negative power
    return 1.0               # remaining case: y == 0 (either sign of zero)
end;


# test data
n = 10000;
r = zeros(Float64, n);  # preallocated output buffer for the benchmark loop
x = rand(Bool, n);
# An odd power keeps the sign of each sample while pushing magnitudes
# towards either very small or very large values.
y = randn(Float64, n) .^ 111;
# Overwrite 100 randomly chosen slots (indices may repeat) with each special
# value in turn; later values may overwrite earlier ones.
for special in (NaN, -Inf, Inf, nextfloat(-Inf), prevfloat(Inf), 0.0, -0.0)
    y[rand(1:n, 100)] .= special
end;

#test
@testset begin
    # Compare the fast version against the reference on every sample.
    # `===` is used so that NaN results and the sign of zero are compared exactly.
    foreach(1:n) do i
        @test f1(x[i],y[i]) === f2(x[i],y[i])
    end
end;

# benchmarks
function f!(f, r, x, y, n)
    # Apply `f` element-wise to the first `n` entries of `x` and `y`,
    # storing each result in the corresponding slot of `r`. Returns `nothing`.
    for i in 1:n
        r[i] = f(x[i], y[i])
    end
end;

# Time the reference (`f1`) and the hand-written (`f2`) kernels over the same data.
# Arguments are `$`-interpolated, as recommended for BenchmarkTools, so the
# measurement is not affected by accessing non-constant globals.
# The trailing comments are the timings reported by the original poster.
@btime f!($f1,$r,$x,$y,$n)  # -> 243.600 μs (0 allocations: 0 bytes)
@btime f!($f2,$r,$x,$y,$n)  # -> 5.783 μs (0 allocations: 0 bytes)

You can write this as

function f3(x::Bool, y::T) where T<:AbstractFloat
    # The result is one when the base is `true` or the exponent is (signed) zero.
    yields_one = x | iszero(y)
    # `!(y > 0)` rather than `y <= 0`: it is also true for NaN, so NaN survives below.
    not_positive = !(y > 0)
    # Inf for any nonzero finite or infinite y, NaN for NaN (abs only feeds NaN through).
    magnitude = abs(y) * T(Inf)
    # Multiplying by `false` gives zero, by `true` leaves the value untouched.
    return ifelse(yields_one, one(T), magnitude * not_positive)
end

which on my computer benchmarks a bit faster.

4 Likes

cool! thank you

If you find a faster way, please make a PR to base for the Float64^Float64 algorithm.

What is the magic of `(!(y>0))`?
If I use `(y<=0)` instead, the tests fail.

NaN>0 and NaN<0 are both false.

1 Like

Out of curiosity, is the purpose of `abs(y)` just to propagate NaN, or is there another reason for it?

Yeah, that was just the cheapest way I could find to propagate NaN without screwing up anything else.

Oscar, I found:

# Fast `Bool ^ float`. A `true` base always gives one; for a `false` base the
# result is chosen with nested `ifelse` calls instead of if/elseif branches.
function pow_fast(x::Bool, y::T) where T <: AbstractFloat
    x && return one(T)
    # Exponent is nonzero here: NaN stays NaN, negative -> Inf, positive -> zero.
    nonzero_case = ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), zero(T)))
    return ifelse(iszero(y), one(T), nonzero_case)
end;
# BigFloat variant: select among Float64 constants and convert once at the end.
function pow_fast(x::Bool, y::BigFloat)
    x && return big(1.0)
    nonzero_case = ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))
    return big(ifelse(iszero(y), 1.0, nonzero_case))
end;

# Fast `float ^ Bool`: a `true` exponent returns the base unchanged,
# a `false` exponent returns one of the same float type.
function pow_fast(x::T, y::Bool) where T <: AbstractFloat
    y || return one(T)
    return x
end;
# BigFloat variant. NOTE: the original author states that only `big(1.0)` is
# acceptable here (not `one(x)`, not `BigFloat(1)`, not `one(T)`); the reason
# is not given in this thread.
function pow_fast(x::BigFloat, y::Bool)
    y || return big(1.0)
    return x
end;

benchmarks, tests:

module M
    using BenchmarkTools, Test

    # Header lines for the report: CPU model and Julia version, formatted as
    # markdown bold, followed by the opening fence of a code block.
    println("**", rstrip(Sys.cpu_info()[1].model), "**")
    println("**Julia-",VERSION,"**\n")

    println("```")

    # Float types covered by the tests and benchmarks. They are listed
    # explicitly instead of using `subtypes(AbstractFloat)`: reflection returns
    # every subtype that happens to be loaded (such as `Core.BFloat16` on recent
    # Julia versions, or package-defined floats), and `randn`/`zeros` may have
    # no method for those, which would abort the whole script.
    const FLOAT_TYPES = (BigFloat, Float16, Float32, Float64)

    # Reference implementation: whatever the built-in `^` does for these types.
    pow_native(x::X, y::Y) where {X,Y} = x ^ y

    # Fast `Bool ^ float`. A `true` base always gives one; for a `false` base
    # the result is selected with nested `ifelse` calls:
    # zero exponent -> one, NaN -> NaN, negative -> Inf, positive -> zero.
    function pow_fast(x::Bool, y::T) where T <: AbstractFloat
        x && return one(T)
        return ifelse(iszero(y), one(T),
                      ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), zero(T))))
    end

    # BigFloat variant: select among Float64 constants, then convert once.
    function pow_fast(x::Bool, y::BigFloat)
        x && return big(1.0)
        return big(ifelse(iszero(y), 1.0,
                          ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))))
    end

    # Build random test data for float type `T`.
    # Returns `(n, r, x, y)`: the sample count, a zeroed result buffer of type
    # `T`, random Bool bases, and exponents of type `T` seeded with special
    # values (NaN, ±Inf, the extreme finite values, ±0).
    function get_test_data1(::Type{T}) where T <: AbstractFloat
        n = 10000
        m = n ÷ 100  # number of slots overwritten per special value
        r = zeros(T, n)  # for result
        x = rand(Bool, n)
        # BigFloat samples are drawn as Float64 and widened; other types are
        # drawn directly. The odd power keeps the sign of each sample.
        y = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
        # Indices are drawn with replacement, so later values may overwrite
        # earlier ones.
        y[rand(1:n, m)] .= T(NaN)
        y[rand(1:n, m)] .= -T(Inf)
        y[rand(1:n, m)] .= T(Inf)
        y[rand(1:n, m)] .= nextfloat(-T(Inf))
        y[rand(1:n, m)] .= prevfloat(T(Inf))
        y[rand(1:n, m)] .= T(0)
        y[rand(1:n, m)] .= -T(0)
        return n, r, x, y
    end

    # tests
    @testset verbose = true "Tests" begin
        @testset verbose = false "Fast `bool ^ float`" begin
            for Flt in FLOAT_TYPES
                @testset verbose = true " $Flt" begin
                    n, r, x, y = get_test_data1(Flt)
                    @testset "$pow_fast" begin
                        for i in 1:n
                            r_native = pow_native(x[i], y[i])
                            r_fast = pow_fast(x[i], y[i])
                            # `===` fails for BigFloat (big(1.0) !== big(1.0))
                            # and `==` fails for NaN; `isequal` treats NaN as
                            # equal to NaN and lets `@test` print both values
                            # when a comparison fails.
                            @test isequal(r_native, r_fast)
                        end
                    end
                end
            end
        end
    end;

    # benchmarks
    # Kernel under test: apply `f` to the first `n` pairs and store into `r`.
    f!(f,r,x,y,n) = for i in 1:n
        r[i] = f(x[i], y[i])
    end;

    println("\nFast `bool ^ float` benchmark (1 - native 2 - fast):")

    for Flt in FLOAT_TYPES
        n, r, x, y = get_test_data1(Flt)
        println(" $Flt:")
        # Arguments are `$`-interpolated, as recommended for BenchmarkTools.
        @btime f!($pow_native,$r,$x,$y,$n)
        @btime f!($pow_fast,$r,$x,$y,$n)
    end

    println("```")

end;

AMD Ryzen 9 3900XT 12-Core Processor
Julia-1.8.0

Test Summary:         |  Pass  Total  Time
Tests                 | 40000  40000  1.3s
  Fast `bool ^ float` | 40000  40000  1.3s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.147 ms (40000 allocations: 1.98 MiB)
  770.800 μs (20000 allocations: 1015.62 KiB)
 Float16:
  203.600 μs (0 allocations: 0 bytes)
  13.200 μs (0 allocations: 0 bytes)
 Float32:
  262.100 μs (0 allocations: 0 bytes)
  1.130 μs (0 allocations: 0 bytes)
 Float64:
  247.500 μs (0 allocations: 0 bytes)
  2.111 μs (0 allocations: 0 bytes)