Speed up "bool ^ float" and "float ^ bool"

I sped up “bool ^ float” and “float ^ bool”.

Can you help me find out how it performs on other processors?

I need the printed output of this script:

module M
	using BenchmarkTools, Test, InteractiveUtils

	# Report header: CPU model and Julia version in Markdown bold, followed by
	# an opening code fence so the results paste cleanly into a forum post.
	let cpu = first(Sys.cpu_info()).model
		println("**" * rstrip(cpu) * "**")
	end
	println("**Julia-$(VERSION)**\n")

	println("```")

	# 
	# Reference implementation: defer to the built-in `^`.
	pow_native(x::X, y::Y) where {X,Y} = x ^ y

	# Select-based `bool ^ float`.
	# The result is 1 when the base is `true` or the exponent is (signed) zero;
	# otherwise the base is `false`: NaN for a NaN exponent, Inf for a negative
	# exponent, and 0 for a positive one.
	function pow_fast(x::Bool, y::T) where T <: AbstractFloat
		nonunit = ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), T(0)))
		return ifelse(x | iszero(y), T(1), nonunit)
	end

	# Same selection for `BigFloat`: pick a Float64 constant, then widen it once.
	function pow_fast(x::Bool, y::BigFloat)
		nonunit = ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))
		return big(ifelse(x | iszero(y), 1.0, nonunit))
	end

	# `float ^ bool`: the base itself when the exponent is `true`, otherwise 1.
	function pow_fast(x::T, y::Bool) where T <: AbstractFloat
		if y
			return copy(x)
		end
		return T(1)
	end

	# `BigFloat ^ bool`: hands back `x` itself (no new value is created) when
	# the exponent is `true`.
	function pow_fast(x::BigFloat, y::Bool)
		if y
			return x
		end
		return big(1.0)
	end

	# test data
	# Build inputs for the `bool ^ float` case.
	# Returns `(n, r, x, y)`: the sample count, a zeroed result buffer of type
	# `T`, random `Bool` bases, and exponents of type `T` (normal samples raised
	# to the 111th power) with special values written at random positions.
	function get_test_data1(::Type{T}) where T <: AbstractFloat
		n = 10000
		m = n ÷ 100  # number of random slots overwritten per special value
		r = zeros(T, n)
		x = rand(Bool, n)
		samples = T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)
		y = samples .^ 111
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		# Positions are drawn with replacement, so later values may overwrite
		# earlier ones.
		for s in specials
			y[rand(1:n, m)] .= s
		end
		return n, r, x, y
	end

	# Build inputs for the `float ^ bool` case.
	# Returns `(n, r, x, y)`: the sample count, a zeroed result buffer of type
	# `T`, bases of type `T` (normal samples raised to the 111th power) with
	# special values written at random positions, and random `Bool` exponents.
	function get_test_data2(::Type{T}) where T <: AbstractFloat
		n = 10000
		m = n ÷ 100  # number of random slots overwritten per special value
		r = zeros(T, n)
		samples = T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)
		x = samples .^ 111
		y = rand(Bool, n)
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		# Positions are drawn with replacement, so later values may overwrite
		# earlier ones.
		for s in specials
			x[rand(1:n, m)] .= s
		end
		return n, r, x, y
	end

	# tests
	# Correctness tests: `pow_fast` must agree with `pow_native` on every sample.
	# The float types are listed explicitly instead of being taken from
	# `subtypes(AbstractFloat)`: `subtypes` reports whatever is loaded in the
	# current session, so any additional float type that the data generators
	# cannot handle would abort the whole run.
	@testset verbose = true "Tests" begin
		float_types = (BigFloat, Float16, Float32, Float64)
		# Results agree when they compare equal or are both NaN (NaN != NaN).
		same_result(a, b) = a == b || (isnan(a) & isnan(b))

		@testset verbose = false "Fast `bool ^ float`" begin
			for Flt in float_types
				@testset verbose = true " $Flt" begin
					n, _, x, y = get_test_data1(Flt)
					@testset "$pow_fast" begin
						for i = 1 : n
							@test same_result(pow_native(x[i], y[i]), pow_fast(x[i], y[i]))
						end
					end
				end
			end
		end
		@testset verbose = false "Fast `float ^ bool`" begin
			for Flt in float_types
				@testset verbose = true " $Flt" begin
					n, _, x, y = get_test_data2(Flt)
					@testset "$pow_fast" begin
						for i = 1 : n
							@test same_result(pow_native(x[i], y[i]), pow_fast(x[i], y[i]))
						end
					end
				end
			end
		end
	end;

	# benchmarks
	# Benchmark kernel: store `f(x[i], y[i])` into `r[i]` for the first `n`
	# indices. Returns `nothing`.
	function f!(f, r, x, y, n)
		i = 1
		while i <= n
			r[i] = f(x[i], y[i])
			i += 1
		end
		return nothing
	end

	# Benchmarks. For every float type two timings are printed: the first line
	# is `pow_native`, the second is `pow_fast`. The float types are listed
	# explicitly rather than taken from `subtypes(AbstractFloat)`, whose result
	# depends on what is loaded in the session.
	for (label, make_data) in (("bool ^ float", get_test_data1), ("float ^ bool", get_test_data2))
		println("\nFast `", label, "` benchmark (1 - native 2 - fast):")
		for Flt in (BigFloat, Float16, Float32, Float64)
			n, r, x, y = make_data(Flt)
			println(" $Flt:")
			# `$` interpolation hands the values to BenchmarkTools directly, so
			# the variable lookups are not part of the measurement.
			@btime f!($pow_native, $r, $x, $y, $n)
			@btime f!($pow_fast, $r, $x, $y, $n)
		end
	end

	# Close the code fence opened by the report header.
	println("```")

end;

2 Likes

Just copy and paste the output (Ctrl+C, Ctrl+V),
like this:

AMD Ryzen 9 3900XT 12-Core Processor

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.0s
  Fast `bool ^ float` | 40000  40000  0.5s
  Fast `float ^ bool` | 40000  40000  0.5s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.130 ms (40000 allocations: 1.98 MiB)
  821.100 μs (20000 allocations: 1015.62 KiB)
 Float16:
  203.400 μs (0 allocations: 0 bytes)
  14.200 μs (0 allocations: 0 bytes)
 Float32:
  257.400 μs (0 allocations: 0 bytes)
  1.440 μs (0 allocations: 0 bytes)
 Float64:
  253.900 μs (0 allocations: 0 bytes)
  3.100 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  600.100 μs (20000 allocations: 1015.62 KiB)
  492.500 μs (10068 allocations: 511.27 KiB)
 Float16:
  38.000 μs (0 allocations: 0 bytes)
  6.840 μs (0 allocations: 0 bytes)
 Float32:
  37.000 μs (0 allocations: 0 bytes)
  706.207 ns (0 allocations: 0 bytes)
 Float64:
  31.600 μs (0 allocations: 0 bytes)
  1.460 μs (0 allocations: 0 bytes)

Intel(R) Core™ i5-4258U CPU @ 2.40GHz

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.9s
  Fast `bool ^ float` | 40000  40000  2.0s
  Fast `float ^ bool` | 40000  40000  0.9s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.287 ms (40000 allocations: 1.98 MiB)
  947.704 μs (20000 allocations: 1015.62 KiB)
 Float16:
  204.216 μs (0 allocations: 0 bytes)
  28.564 μs (0 allocations: 0 bytes)
 Float32:
  213.907 μs (0 allocations: 0 bytes)
  4.211 μs (0 allocations: 0 bytes)
 Float64:
  267.693 μs (0 allocations: 0 bytes)
  9.657 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):

 BigFloat:
  646.104 μs (20000 allocations: 1015.62 KiB)
  552.348 μs (10044 allocations: 510.05 KiB)
 Float16:
  70.192 μs (0 allocations: 0 bytes)
  15.225 μs (0 allocations: 0 bytes)
 Float32:
  68.144 μs (0 allocations: 0 bytes)
  1.885 μs (0 allocations: 0 bytes)
 Float64:
  59.002 μs (0 allocations: 0 bytes)
  3.813 μs (0 allocations: 0 bytes)
1 Like

OS: Windows (x86_64-w64-mingw32)
CPU: 8 × 11th Gen Intel(R) Core™ i7-1165G7 @ 2.80GHz

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.4s
  Fast `bool ^ float` | 40000  40000  1.7s
  Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  765.500 μs (40000 allocations: 1.98 MiB)
  570.800 μs (20000 allocations: 1015.62 KiB)
 Float16:
  113.100 μs (0 allocations: 0 bytes)
  22.200 μs (0 allocations: 0 bytes)
 Float32:
  111.200 μs (0 allocations: 0 bytes)
  1.090 μs (0 allocations: 0 bytes)
 Float64:
  140.200 μs (0 allocations: 0 bytes)
  2.144 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):

 BigFloat:
  391.300 μs (20000 allocations: 1015.62 KiB)
  320.400 μs (9962 allocations: 505.88 KiB)
 Float16:
  15.400 μs (0 allocations: 0 bytes)
  4.229 μs (0 allocations: 0 bytes)
 Float32:
  20.500 μs (0 allocations: 0 bytes)
  691.753 ns (0 allocations: 0 bytes)
 Float64:
  31.300 μs (0 allocations: 0 bytes)
  1.490 μs (0 allocations: 0 bytes)
1 Like

12th Gen Intel(R) Core™ i7-12700K

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  0.9s
  Fast `bool ^ float` | 40000  40000  0.5s
  Fast `float ^ bool` | 40000  40000  0.4s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  677.800 μs (40000 allocations: 1.98 MiB)
  488.700 μs (20000 allocations: 1015.62 KiB)
 Float16:
  200.600 μs (0 allocations: 0 bytes)
  9.700 μs (0 allocations: 0 bytes)
 Float32:
  241.400 μs (0 allocations: 0 bytes)
  1.560 μs (0 allocations: 0 bytes)
 Float64:
  222.400 μs (0 allocations: 0 bytes)
  3.400 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  330.600 μs (20000 allocations: 1015.62 KiB)
  288.500 μs (9928 allocations: 504.16 KiB)
 Float16:
  31.900 μs (0 allocations: 0 bytes)
  4.057 μs (0 allocations: 0 bytes)
 Float32:
  32.700 μs (0 allocations: 0 bytes)
  656.667 ns (0 allocations: 0 bytes)
 Float64:
  27.000 μs (0 allocations: 0 bytes)
  1.280 μs (0 allocations: 0 bytes)
1 Like

Edited so that it can simply be run in the REPL.

Intel(R) Celeron(R) CPU B815 @ 1.60GHz
Julia-1.8.3

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  7.7s
  Fast `bool ^ float` | 40000  40000  5.5s
  Fast `float ^ bool` | 40000  40000  2.2s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  3.705 ms (40000 allocations: 1.98 MiB)
  2.529 ms (20000 allocations: 1015.62 KiB)
 Float16:
  674.400 μs (0 allocations: 0 bytes)
  182.400 μs (0 allocations: 0 bytes)
 Float32:
  400.100 μs (0 allocations: 0 bytes)
  13.000 μs (0 allocations: 0 bytes)
 Float64:
  776.300 μs (0 allocations: 0 bytes)
  29.000 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  1.909 ms (20000 allocations: 1015.62 KiB)
  1.522 ms (10056 allocations: 510.66 KiB)
 Float16:
  315.800 μs (0 allocations: 0 bytes)
  28.600 μs (0 allocations: 0 bytes)
 Float32:
  135.200 μs (0 allocations: 0 bytes)
  5.133 μs (0 allocations: 0 bytes)
 Float64:
  133.300 μs (0 allocations: 0 bytes)
  10.900 μs (0 allocations: 0 bytes)

Intel(R) Core™ i5-8250U CPU @ 1.60GHz
Julia-1.8.3

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  3.4s
  Fast `bool ^ float` | 40000  40000  2.7s
  Fast `float ^ bool` | 40000  40000  0.7s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.068 ms (40000 allocations: 1.98 MiB)
  782.911 μs (20000 allocations: 1015.62 KiB)
 Float16:
  189.180 μs (0 allocations: 0 bytes)
  24.224 μs (0 allocations: 0 bytes)
 Float32:
  171.924 μs (0 allocations: 0 bytes)
  2.362 μs (0 allocations: 0 bytes)
 Float64:
  239.502 μs (0 allocations: 0 bytes)
  5.628 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  552.802 μs (20000 allocations: 1015.62 KiB)
  466.761 μs (10030 allocations: 509.34 KiB)
 Float16:
  67.050 μs (0 allocations: 0 bytes)
  15.696 μs (0 allocations: 0 bytes)
 Float32:
  57.782 μs (0 allocations: 0 bytes)
  1.145 μs (0 allocations: 0 bytes)
 Float64:
  53.871 μs (0 allocations: 0 bytes)
  2.180 μs (0 allocations: 0 bytes)
1 Like

Apple M1

**Apple M1**
**Julia-1.10.0-DEV.39**

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.0s
  Fast `bool ^ float` | 40000  40000  0.7s
  Fast `float ^ bool` | 40000  40000  0.3s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  642.959 μs (40000 allocations: 1.98 MiB)
  505.334 μs (20000 allocations: 1015.62 KiB)
 Float16:
  120.708 μs (0 allocations: 0 bytes)
  1.179 μs (0 allocations: 0 bytes)
 Float32:
  124.208 μs (0 allocations: 0 bytes)
  2.546 μs (0 allocations: 0 bytes)
 Float64:
  252.917 μs (0 allocations: 0 bytes)
  5.104 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  354.458 μs (20000 allocations: 1015.62 KiB)
  245.042 μs (10096 allocations: 512.69 KiB)
 Float16:
  34.250 μs (0 allocations: 0 bytes)
  456.431 ns (0 allocations: 0 bytes)
 Float32:
  34.291 μs (0 allocations: 0 bytes)
  1.033 μs (0 allocations: 0 bytes)
 Float64:
  40.458 μs (0 allocations: 0 bytes)
  2.852 μs (0 allocations: 0 bytes)
1 Like

That is cool: the M1 processes half precision faster than float and double, unlike all these Intel and AMD CPUs.

1 Like

12th Gen Intel(R) Core™ i7-1270P
Julia-1.8.3

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  3.0s
  Fast `bool ^ float` | 40000  40000  2.1s
  Fast `float ^ bool` | 40000  40000  0.9s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  997.800 μs (40000 allocations: 1.98 MiB)
  765.700 μs (20000 allocations: 1015.62 KiB)
 Float16:
  146.900 μs (0 allocations: 0 bytes)
  14.100 μs (0 allocations: 0 bytes)
 Float32:
  146.800 μs (0 allocations: 0 bytes)
  2.622 μs (0 allocations: 0 bytes)
 Float64:
  165.500 μs (0 allocations: 0 bytes)
  5.157 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  571.300 μs (20000 allocations: 1015.62 KiB)
  483.400 μs (10036 allocations: 509.64 KiB)
 Float16:
  59.700 μs (0 allocations: 0 bytes)
  5.900 μs (0 allocations: 0 bytes)
 Float32:
  47.400 μs (0 allocations: 0 bytes)
  1.230 μs (0 allocations: 0 bytes)
 Float64:
  39.900 μs (0 allocations: 0 bytes)
  2.178 μs (0 allocations: 0 bytes)

AMD Ryzen 9 5950X 16-Core Processor
Julia-1.8.3

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.6s
  Fast `bool ^ float` | 40000  40000  1.1s
  Fast `float ^ bool` | 40000  40000  0.5s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  795.900 μs (40000 allocations: 1.98 MiB)
  584.600 μs (20000 allocations: 1015.62 KiB)
 Float16:
  93.900 μs (0 allocations: 0 bytes)
  11.200 μs (0 allocations: 0 bytes)
 Float32:
  84.400 μs (0 allocations: 0 bytes)
  1.190 μs (0 allocations: 0 bytes)
 Float64:
  111.600 μs (0 allocations: 0 bytes)
  2.378 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  420.100 μs (20000 allocations: 1015.62 KiB)
  328.800 μs (9980 allocations: 506.80 KiB)
 Float16:
  16.600 μs (0 allocations: 0 bytes)
  4.857 μs (0 allocations: 0 bytes)
 Float32:
  16.400 μs (0 allocations: 0 bytes)
  599.444 ns (0 allocations: 0 bytes)
 Float64:
  12.300 μs (0 allocations: 0 bytes)
  1.100 μs (0 allocations: 0 bytes)

Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz
Julia-1.8.0

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.6s
  Fast `bool ^ float` | 40000  40000  1.8s
  Fast `float ^ bool` | 40000  40000  0.8s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.427 ms (40000 allocations: 1.98 MiB)
  1.037 ms (20000 allocations: 1015.62 KiB)
 Float16:
  334.900 μs (0 allocations: 0 bytes)
  37.600 μs (0 allocations: 0 bytes)
 Float32:
  392.500 μs (0 allocations: 0 bytes)
  1.820 μs (0 allocations: 0 bytes)
 Float64:
  373.200 μs (0 allocations: 0 bytes)
  3.538 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  751.700 μs (20000 allocations: 1015.62 KiB)
  633.900 μs (9984 allocations: 507.00 KiB)
 Float16:
  26.700 μs (0 allocations: 0 bytes)
  19.900 μs (0 allocations: 0 bytes)
 Float32:
  25.100 μs (0 allocations: 0 bytes)
  1.000 μs (0 allocations: 0 bytes)
 Float64:
  58.200 μs (0 allocations: 0 bytes)
  2.067 μs (0 allocations: 0 bytes)
1 Like

thank you

That’s not really surprising, since it’s one of the CPUs that currently have hardware support for half-precision floating-point numbers.

5 Likes

AMD Ryzen 5 3500X 6-Core Processor
Julia-1.7.2

Test Summary:         |  Pass  Total
Tests                 | 80000  80000
  Fast `bool ^ float` | 40000  40000
  Fast `float ^ bool` | 40000  40000

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.311 ms (40000 allocations: 2.14 MiB)
  893.300 μs (20000 allocations: 1.07 MiB)
 Float16:
  115.200 μs (0 allocations: 0 bytes)
  17.500 μs (0 allocations: 0 bytes)
 Float32:
  279.100 μs (0 allocations: 0 bytes)
  52.900 μs (0 allocations: 0 bytes)
 Float64:
  236.300 μs (0 allocations: 0 bytes)
  45.200 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  670.200 μs (20000 allocations: 1.07 MiB)
  556.300 μs (9908 allocations: 541.84 KiB)
 Float16:
  25.600 μs (0 allocations: 0 bytes)
  22.900 μs (0 allocations: 0 bytes)
 Float32:
  22.600 μs (0 allocations: 0 bytes)
  22.600 μs (0 allocations: 0 bytes)
 Float64:
  20.200 μs (0 allocations: 0 bytes)
  20.400 μs (0 allocations: 0 bytes)
1 Like

valuable result

Maybe someone has exotic or ancient hardware, or mobile processors?

Intel(R) Core™ i5-8365U CPU @ 1.60GHz
Julia-1.6.6

Test Summary:         |  Pass  Total
Tests                 | 80000  80000
  Fast `bool ^ float` | 40000  40000
  Fast `float ^ bool` | 40000  40000

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.325 ms (40000 allocations: 2.14 MiB)
  1.200 ms (20000 allocations: 1.07 MiB)
 Float16:
  133.400 μs (0 allocations: 0 bytes)
  20.500 μs (0 allocations: 0 bytes)
 Float32:
  148.700 μs (0 allocations: 0 bytes)
  84.300 μs (0 allocations: 0 bytes)
 Float64:
  164.000 μs (0 allocations: 0 bytes)
  86.400 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  704.500 μs (20000 allocations: 1.07 MiB)
  607.900 μs (10076 allocations: 551.03 KiB)
 Float16:
  52.000 μs (0 allocations: 0 bytes)
  57.000 μs (0 allocations: 0 bytes)
 Float32:
  50.000 μs (0 allocations: 0 bytes)
  51.500 μs (0 allocations: 0 bytes)
 Float64:
  50.300 μs (0 allocations: 0 bytes)
  34.900 μs (0 allocations: 0 bytes)
1 Like

An even faster version:

module M
	using BenchmarkTools, Test, InteractiveUtils

	# Report header: CPU model, Julia version and a variant marker, followed by
	# an opening code fence so the results paste cleanly into a forum post.
	let cpu = first(Sys.cpu_info()).model
		println("**" * rstrip(cpu) * "**")
	end
	println("**Julia-$(VERSION)**")
	println("*more faster\n")

	println("```")

	# 
	# Reference implementation: defer to the built-in `^`.
	pow_native(x::X, y::Y) where {X,Y} = x ^ y

	# `bool ^ float` with an early exit for a `true` base.
	# For a `false` base: 1 for a (signed) zero exponent, NaN for a NaN
	# exponent, Inf for a negative exponent, 0 for a positive one.
	function pow_fast(x::Bool, y::T) where T <: AbstractFloat
		x && return one(T)
		by_sign = ifelse(signbit(y), T(Inf), zero(T))
		return ifelse(iszero(y), one(T), ifelse(isnan(y), T(NaN), by_sign))
	end

	# Same logic for `BigFloat`: pick a Float64 constant, then widen it once.
	function pow_fast(x::Bool, y::BigFloat)
		x && return big(1.0)
		by_sign = ifelse(signbit(y), Inf, 0.0)
		return big(ifelse(iszero(y), 1.0, ifelse(isnan(y), NaN, by_sign)))
	end

	# `float ^ bool`: the base itself when the exponent is `true`, otherwise 1.
	function pow_fast(x::T, y::Bool) where T <: AbstractFloat
		y || return one(T)
		return x
	end

	# `BigFloat ^ bool`. The original author's note insists on `big(1.0)` here
	# rather than `one(x)`, `BigFloat(1)` or `one(T)`.
	function pow_fast(x::BigFloat, y::Bool)
		y || return big(1.0)
		return x
	end

	# test data
	# Build inputs for the `bool ^ float` case.
	# Returns `(n, r, x, y)`: the sample count, a zeroed result buffer of type
	# `T`, random `Bool` bases, and exponents of type `T` (normal samples raised
	# to the 111th power) with special values written at random positions.
	function get_test_data1(::Type{T}) where T <: AbstractFloat
		n = 10000
		m = n ÷ 100  # number of random slots overwritten per special value
		r = zeros(T, n)
		# Swap in `ones(Bool, n)` or `zeros(Bool, n)` to use a constant base.
		x = rand(Bool, n)
		samples = T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)
		y = samples .^ 111
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		# Positions are drawn with replacement, so later values may overwrite
		# earlier ones.
		for s in specials
			y[rand(1:n, m)] .= s
		end
		return n, r, x, y
	end

	# Build inputs for the `float ^ bool` case.
	# Returns `(n, r, x, y)`: the sample count, a zeroed result buffer of type
	# `T`, bases of type `T` (normal samples raised to the 111th power) with
	# special values written at random positions, and random `Bool` exponents.
	function get_test_data2(::Type{T}) where T <: AbstractFloat
		n = 10000
		m = n ÷ 100  # number of random slots overwritten per special value
		r = zeros(T, n)
		samples = T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)
		x = samples .^ 111
		# Swap in `ones(Bool, n)` or `zeros(Bool, n)` to use a constant exponent.
		y = rand(Bool, n)
		specials = (
			T(NaN), -T(Inf), T(Inf),
			nextfloat(-T(Inf)), prevfloat(T(Inf)),
			T(0), -T(0),
		)
		# Positions are drawn with replacement, so later values may overwrite
		# earlier ones.
		for s in specials
			x[rand(1:n, m)] .= s
		end
		return n, r, x, y
	end

	# tests
	# Correctness tests: `pow_fast` must agree with `pow_native` on every sample.
	# The float types are listed explicitly instead of being taken from
	# `subtypes(AbstractFloat)`: `subtypes` reports whatever is loaded in the
	# current session, so any additional float type that the data generators
	# cannot handle would abort the whole run.
	@testset verbose = true "Tests" begin
		float_types = (BigFloat, Float16, Float32, Float64)
		# Results agree when they compare equal or are both NaN (NaN != NaN).
		same_result(a, b) = a == b || (isnan(a) & isnan(b))

		@testset verbose = false "Fast `bool ^ float`" begin
			for Flt in float_types
				@testset verbose = true " $Flt" begin
					n, _, x, y = get_test_data1(Flt)
					@testset "$pow_fast" begin
						for i = 1 : n
							@test same_result(pow_native(x[i], y[i]), pow_fast(x[i], y[i]))
						end
					end
				end
			end
		end
		@testset verbose = false "Fast `float ^ bool`" begin
			for Flt in float_types
				@testset verbose = true " $Flt" begin
					n, _, x, y = get_test_data2(Flt)
					@testset "$pow_fast" begin
						for i = 1 : n
							@test same_result(pow_native(x[i], y[i]), pow_fast(x[i], y[i]))
						end
					end
				end
			end
		end
	end;

	# benchmarks
	# func
	# Benchmark kernel: store `f(x[i], y[i])` into `r[i]` for the first `n`
	# indices. Returns `nothing`.
	function f!(f, r, x, y, n)
		i = 1
		while i <= n
			r[i] = f(x[i], y[i])
			i += 1
		end
		return nothing
	end

	# Benchmarks. For every float type two timings are printed: the first line
	# is `pow_native`, the second is `pow_fast`. The float types are listed
	# explicitly rather than taken from `subtypes(AbstractFloat)`, whose result
	# depends on what is loaded in the session.
	for (label, make_data) in (("bool ^ float", get_test_data1), ("float ^ bool", get_test_data2))
		println("\nFast `", label, "` benchmark (1 - native 2 - fast):")
		for Flt in (BigFloat, Float16, Float32, Float64)
			n, r, x, y = make_data(Flt)
			println(" $Flt:")
			# `$` interpolation hands the values to BenchmarkTools directly, so
			# the variable lookups are not part of the measurement.
			@btime f!($pow_native, $r, $x, $y, $n)
			@btime f!($pow_fast, $r, $x, $y, $n)
		end
	end

	# Close the code fence opened by the report header.
	println("```")

end;

Intel(R) Core™ i7-9750H CPU @ 2.60GHz
Julia-1.8.3

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.2s
  Fast `bool ^ float` | 40000  40000  1.6s
  Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  723.195 μs (40000 allocations: 1.98 MiB)
  484.415 μs (20000 allocations: 1015.62 KiB)
 Float16:
  153.620 μs (0 allocations: 0 bytes)
  18.500 μs (0 allocations: 0 bytes)
 Float32:
  142.652 μs (0 allocations: 0 bytes)
  1.567 μs (0 allocations: 0 bytes)
 Float64:
  199.983 μs (0 allocations: 0 bytes)
  3.304 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  399.706 μs (20000 allocations: 1015.62 KiB)
  273.336 μs (10024 allocations: 509.03 KiB)
 Float16:
  58.514 μs (0 allocations: 0 bytes)
  13.401 μs (0 allocations: 0 bytes)
 Float32:
  56.400 μs (0 allocations: 0 bytes)
  950.435 ns (0 allocations: 0 bytes)
 Float64:
  45.594 μs (0 allocations: 0 bytes)
  1.707 μs (0 allocations: 0 bytes)
1 Like

AMD Ryzen 9 3900XT 12-Core Processor
Julia-1.8.0
*more faster

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.9s
  Fast `bool ^ float` | 40000  40000  1.3s
  Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
 BigFloat:
  1.198 ms (40000 allocations: 1.98 MiB)
  808.500 μs (20000 allocations: 1015.62 KiB)
 Float16:
  224.500 μs (0 allocations: 0 bytes)
  14.100 μs (0 allocations: 0 bytes)
 Float32:
  276.900 μs (0 allocations: 0 bytes)
  1.210 μs (0 allocations: 0 bytes)
 Float64:
  257.400 μs (0 allocations: 0 bytes)
  2.100 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
 BigFloat:
  637.100 μs (20000 allocations: 1015.62 KiB)
  526.000 μs (9952 allocations: 505.38 KiB)
 Float16:
  37.200 μs (0 allocations: 0 bytes)
  6.700 μs (0 allocations: 0 bytes)
 Float32:
  36.600 μs (0 allocations: 0 bytes)
  726.515 ns (0 allocations: 0 bytes)
 Float64:
  33.000 μs (0 allocations: 0 bytes)
  1.410 μs (0 allocations: 0 bytes)