Speed up "bool ^ float" and "float ^ bool"

12th Gen Intel(R) Core™ i7-12700K
Julia-1.8.3
more faster

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.3s
  Fast `bool ^ float` | 40000  40000  0.9s
  Fast `float ^ bool` | 40000  40000  0.4s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  672.900 μs (40000 allocations: 1.98 MiB)
  438.800 μs (20000 allocations: 1015.62 KiB)
 Float16:
  98.600 μs (0 allocations: 0 bytes)
  9.200 μs (0 allocations: 0 bytes)
 Float32:
  91.100 μs (0 allocations: 0 bytes)
  1.220 μs (0 allocations: 0 bytes)
 Float64:
  117.600 μs (0 allocations: 0 bytes)
  2.344 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  333.500 μs (20000 allocations: 1015.62 KiB)
  275.800 μs (10100 allocations: 512.89 KiB)
 Float16:
  29.900 μs (0 allocations: 0 bytes)
  3.763 μs (0 allocations: 0 bytes)
 Float32:
  31.000 μs (0 allocations: 0 bytes)
  622.156 ns (0 allocations: 0 bytes)
 Float64:
  26.000 μs (0 allocations: 0 bytes)
  1.210 μs (0 allocations: 0 bytes)
1 Like

BigFloat has some problems; I tried to work around them.

`bool ^ bigfloat` and `bigfloat ^ bool` got faster too.

module M
	using BenchmarkTools, Test, InteractiveUtils

	# Report header: the model string of the first CPU entry, the running Julia
	# version, and a caption, each wrapped in Markdown emphasis markers.
	let cpu_model = rstrip(first(Sys.cpu_info()).model)
		println("**$(cpu_model)**")
	end
	println("**Julia-$(VERSION)**")
	println("*More faster (BigFloat too)* \n")

	# A line of three backticks; presumably opens a Markdown code fence around the
	# test summary and benchmark timings printed below.
	println("```")

	# Baseline for comparison: forwards both arguments to the built-in `^`
	# unchanged, so the result is whatever Base's exponentiation yields for
	# the given argument types.
	function pow_native(x::X, y::Y) where {X,Y}
		return x ^ y
	end

	# Fast paths for exponentiation when one operand is a `Bool`.
	#
	# `bool ^ float` -- the base can only be 1 (`true`) or 0 (`false`), so the result
	# follows from a few checks on the exponent `y`:
	#   true  ^ y          -> 1
	#   false ^ (+0 or -0) -> 1
	#   false ^ NaN        -> NaN
	#   false ^ (y < 0)    -> Inf
	#   false ^ (y > 0)    -> 0
	# `ifelse` evaluates every candidate value; here each one is just a constant of
	# type `T`, and the result always has type `T`.
	pow_fast(x::Bool, y::T) where T <: AbstractFloat =
		x ? one(T) :
			ifelse(iszero(y), one(T),
				ifelse(isnan(y), T(NaN),
					ifelse(signbit(y), T(Inf), zero(T))))

	# `BigFloat` variant of the table above. Only the value for the branch actually
	# taken is constructed, and it is constructed at call time. `big"..."` literals
	# are deliberately avoided: they are built once, when the macro is expanded, so
	# they would keep the precision that was active at load time (ignoring a later
	# `setprecision`) and would hand the same mutable object to every caller.
	function pow_fast(x::Bool, y::BigFloat)
		(x || iszero(y)) && return one(BigFloat)
		isnan(y) && return BigFloat(NaN)
		# NaN is excluded above, so the sign bit alone separates negative from positive.
		return signbit(y) ? BigFloat(Inf) : zero(BigFloat)
	end

	# `float ^ bool` -- `x ^ true` is `x` itself (returned as-is, not copied) and
	# `x ^ false` is 1 of the same type, for every `x` including NaN and infinities.
	pow_fast(x::T, y::Bool) where T <: AbstractFloat = y ? x : one(T)

	# Kept as a separate method so the set of `pow_fast` methods is unchanged; the
	# constant 1 is built at call time for the same reasons as in the method above.
	pow_fast(x::BigFloat, y::Bool) = y ? x : one(BigFloat)
	
	# test data
	# Build random inputs for the `bool ^ float` case.
	#
	# Arguments:
	#   T -- floating-point type of the exponents (and of the result buffer)
	#   n -- number of samples (keyword, default 10000); must be at least 1
	#
	# Returns the tuple `(n, r, x, y)`:
	#   r -- zero-filled `Vector{T}` of length `n`, to be used as the output buffer
	#   x -- `n` random `Bool` bases
	#   y -- `n` exponents of type `T`: standard-normal draws raised to the 111th
	#        power, with special values written over randomly chosen positions
	#
	# To exercise a single branch only, replace `x` by `ones(Bool, n)` or
	# `zeros(Bool, n)`.
	function get_test_data1(::Type{T}; n::Integer = 10000) where T <: AbstractFloat
		n >= 1 || throw(ArgumentError("n must be at least 1, got $n"))
		m = n ÷ 100  # how many positions are drawn for each special value
		r = zeros(T, n)
		x = rand(Bool, n)
		# `randn` is not called with `BigFloat` here; Float64 draws are widened instead.
		y = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
		# Positions are drawn independently (with replacement) for every special
		# value, so a later value may overwrite an earlier one.
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		for v in specials
			y[rand(1:n, m)] .= v
		end
		return n, r, x, y
	end

	# Build random inputs for the `float ^ bool` case.
	#
	# Arguments:
	#   T -- floating-point type of the bases (and of the result buffer)
	#   n -- number of samples (keyword, default 10000); must be at least 1
	#
	# Returns the tuple `(n, r, x, y)`:
	#   r -- zero-filled `Vector{T}` of length `n`, to be used as the output buffer
	#   x -- `n` bases of type `T`: standard-normal draws raised to the 111th
	#        power, with special values written over randomly chosen positions
	#   y -- `n` random `Bool` exponents
	#
	# To exercise a single branch only, replace `y` by `ones(Bool, n)` or
	# `zeros(Bool, n)`.
	function get_test_data2(::Type{T}; n::Integer = 10000) where T <: AbstractFloat
		n >= 1 || throw(ArgumentError("n must be at least 1, got $n"))
		m = n ÷ 100  # how many positions are drawn for each special value
		r = zeros(T, n)
		# `randn` is not called with `BigFloat` here; Float64 draws are widened instead.
		x = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
		y = rand(Bool, n)
		# Positions are drawn independently (with replacement) for every special
		# value, so a later value may overwrite an earlier one.
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		for v in specials
			x[rand(1:n, m)] .= v
		end
		return n, r, x, y
	end

	# tests
	# Each `pow_fast` result is compared with `pow_native` on the same pair of inputs,
	# one `@test` per pair. A pair passes when the two results are `==`, or when both
	# are NaN (NaN never compares equal, not even to itself).
	@testset verbose = true "Tests" begin
		@testset verbose = false "Fast `bool ^ float`" begin
			@testset verbose = true " $Flt" for Flt in subtypes(AbstractFloat)
				_, _, bases, exponents = get_test_data1(Flt)
				@testset "$pow_fast" begin
					for (b, e) in zip(bases, exponents)
						expected = pow_native(b, e)
						actual = pow_fast(b, e)
						@test (expected == actual) || (isnan(expected) && isnan(actual))
					end
				end
			end
		end
		@testset verbose = false "Fast `float ^ bool`" begin
			@testset verbose = true " $Flt" for Flt in subtypes(AbstractFloat)
				_, _, bases, exponents = get_test_data2(Flt)
				@testset "$pow_fast" begin
					for (b, e) in zip(bases, exponents)
						expected = pow_native(b, e)
						actual = pow_fast(b, e)
						@test (expected == actual) || (isnan(expected) && isnan(actual))
					end
				end
			end
		end
	end;

	# benchmarks
	# Kernel that gets timed: for every `i` in `1:n`, store `f(x[i], y[i])` in `r[i]`.
	# Entries of `r` past index `n` are left untouched. Returns `nothing`.
	function f!(f, r, x, y, n)
		for i in 1:n
			r[i] = f(x[i], y[i])
		end
		return nothing
	end

	# Time the kernel `f!` for both operand orders. For each float type returned by
	# `subtypes(AbstractFloat)` two timings are printed on fresh random data:
	# first with `pow_native`, then with `pow_fast`.
	for (title, make_data) in (
		("\nFast `bool ^ float` benchmark (1 - native 2 - fast):", get_test_data1),
		("\nFast `float ^ bool` benchmark (1 - native 2 - fast):", get_test_data2),
	)
		println(title)
		for Flt in subtypes(AbstractFloat)
			n, r, x, y = make_data(Flt)
			println(" $Flt:")
			@btime f!($pow_native, $r, $x, $y, $n)
			@btime f!($pow_fast, $r, $x, $y, $n)
		end
	end

	# A line of three backticks; presumably closes the Markdown code fence that
	# was opened before the tests ran.
	println("```")

end;

AMD Ryzen 9 3900XT 12-Core Processor
Julia-1.8.0
More faster (BigFloat too)

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.8s
  Fast `bool ^ float` | 40000  40000  1.2s
  Fast `float ^ bool` | 40000  40000  0.5s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.100 ms (40000 allocations: 1.98 MiB)
  107.200 μs (0 allocations: 0 bytes)
 Float16:
  197.300 μs (0 allocations: 0 bytes)
  12.600 μs (0 allocations: 0 bytes)
 Float32:
  248.600 μs (0 allocations: 0 bytes)
  1.080 μs (0 allocations: 0 bytes)
 Float64:
  242.000 μs (0 allocations: 0 bytes)
  2.056 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  575.300 μs (20000 allocations: 1015.62 KiB)
  9.600 μs (0 allocations: 0 bytes)
 Float16:
  36.600 μs (0 allocations: 0 bytes)
  6.380 μs (0 allocations: 0 bytes)
 Float32:
  33.800 μs (0 allocations: 0 bytes)
  668.987 ns (0 allocations: 0 bytes)
 Float64:
  32.500 μs (0 allocations: 0 bytes)
  1.570 μs (0 allocations: 0 bytes)

12th Gen Intel(R) Core™ i7-12700K
Julia-1.8.3
More faster (BigFloat too)

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.5s
  Fast `bool ^ float` | 40000  40000  1.0s
  Fast `float ^ bool` | 40000  40000  0.4s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  663.600 μs (40000 allocations: 1.98 MiB)
  92.700 μs (0 allocations: 0 bytes)
 Float16:
  106.400 μs (0 allocations: 0 bytes)
  10.200 μs (0 allocations: 0 bytes)
 Float32:
  97.800 μs (0 allocations: 0 bytes)
  1.240 μs (0 allocations: 0 bytes)
 Float64:
  116.300 μs (0 allocations: 0 bytes)
  2.500 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  353.700 μs (20000 allocations: 1015.62 KiB)
  8.833 μs (0 allocations: 0 bytes)
 Float16:
  32.500 μs (0 allocations: 0 bytes)
  4.075 μs (0 allocations: 0 bytes)
 Float32:
  33.100 μs (0 allocations: 0 bytes)
  634.694 ns (0 allocations: 0 bytes)
 Float64:
  25.200 μs (0 allocations: 0 bytes)
  1.260 μs (0 allocations: 0 bytes)
1 Like

Intel(R) Core™ i7-10750H CPU @ 2.60GHz
Julia-1.8.2
More faster (BigFloat too)

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.9s
  Fast `bool ^ float` | 40000  40000  1.3s
  Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  964.600 μs (40000 allocations: 1.98 MiB)
  112.700 μs (0 allocations: 0 bytes)
 Float16:
  131.800 μs (0 allocations: 0 bytes)
  16.200 μs (0 allocations: 0 bytes)
 Float32:
  125.600 μs (0 allocations: 0 bytes)
  1.490 μs (0 allocations: 0 bytes)
 Float64:
  161.700 μs (0 allocations: 0 bytes)
  2.956 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  489.400 μs (20000 allocations: 1015.62 KiB)
  11.000 μs (0 allocations: 0 bytes)
 Float16:
  42.700 μs (0 allocations: 0 bytes)
  10.800 μs (0 allocations: 0 bytes)
 Float32:
  43.100 μs (0 allocations: 0 bytes)
  887.500 ns (0 allocations: 0 bytes)
 Float64:
  36.700 μs (0 allocations: 0 bytes)
  1.520 μs (0 allocations: 0 bytes)
1 Like