# Speed up "bool ^ float" and "float ^ bool"

I have sped up “bool ^ float” and “float ^ bool”.

Can you help me find out how it performs on other processors?

I need the printed output of this script:

``````julia
module M
using BenchmarkTools, Test, InteractiveUtils

# Report header: CPU model of the first entry returned by `Sys.cpu_info()` and
# the running Julia version, both wrapped in Markdown bold markers.
let cpu = Sys.cpu_info()[1]
    println(string("**", rstrip(cpu.model), "**"))
    println(string("**Julia-", VERSION, "**\n"))
end

# Opening Markdown code fence for the output that follows.
println("```")

# Baseline for the comparison: plain `x ^ y`, wrapped in a named function so it
# can be passed around like `pow_fast`.
function pow_native(x::X, y::Y) where {X,Y}
    return x ^ y
end

# Fast `Bool ^ float`.
# The result is 1 when the base is `true` or the exponent is (signed) zero.
# Otherwise the base is `false` and only the exponent matters: NaN for a NaN
# exponent, Inf for a negative exponent, 0 for a positive exponent.
# `ifelse` evaluates all of its arguments, so no short-circuiting takes place.
function pow_fast(x::Bool, y::T) where T <: AbstractFloat
    false_base = ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), T(0)))
    return ifelse(x | iszero(y), T(1), false_base)
end

# BigFloat exponent: select the result as a Float64, then convert it with `big`.
function pow_fast(x::Bool, y::BigFloat)
    false_base = ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))
    return big(ifelse(x | iszero(y), 1.0, false_base))
end

# Fast `float ^ Bool`: `x ^ true` gives (a copy of) `x`, `x ^ false` gives 1.
function pow_fast(x::T, y::Bool) where T <: AbstractFloat
    y && return copy(x)
    return T(1)
end

# BigFloat base: `x` itself is returned for `true`, a fresh `big(1.0)` for `false`.
function pow_fast(x::BigFloat, y::Bool)
    y && return x
    return big(1.0)
end

# test data
# Inputs for the `Bool ^ T` case. Returns `(n, r, x, y)`:
#   n - number of elements (fixed at 10000)
#   r - zero-filled result buffer of element type T
#   x - random Bool bases
#   y - exponents of type T: normal random numbers raised to the 111th power,
#       with randomly chosen slots overwritten by special values
function get_test_data1(::Type{T}) where T <: AbstractFloat
    n = 10000
    m = n ÷ 100  # indices drawn (with replacement) per special value
    r = zeros(T, n)
    x = rand(Bool, n)
    y = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
    # Special values of type T, written in this order; a later value may
    # overwrite a slot taken by an earlier one.
    specials = (
        T(NaN),
        -T(Inf),
        T(Inf),
        nextfloat(-T(Inf)),
        prevfloat(T(Inf)),
        T(0),
        -T(0),
    )
    for s in specials
        y[rand(1:n, m)] .= s
    end
    return n, r, x, y
end

# Inputs for the `T ^ Bool` case. Returns `(n, r, x, y)`:
#   n - number of elements (fixed at 10000)
#   r - zero-filled result buffer of element type T
#   x - bases of type T: normal random numbers raised to the 111th power,
#       with randomly chosen slots overwritten by special values
#   y - random Bool exponents
function get_test_data2(::Type{T}) where T <: AbstractFloat
    n = 10000
    m = n ÷ 100  # indices drawn (with replacement) per special value
    r = zeros(T, n)
    x = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
    y = rand(Bool, n)
    # Special values of type T, written in this order; a later value may
    # overwrite a slot taken by an earlier one.
    specials = (
        T(NaN),
        -T(Inf),
        T(Inf),
        nextfloat(-T(Inf)),
        prevfloat(T(Inf)),
        T(0),
        -T(0),
    )
    for s in specials
        x[rand(1:n, m)] .= s
    end
    return n, r, x, y
end

# tests
# Compare `pow_fast` with `pow_native` element by element, for every type
# returned by `subtypes(AbstractFloat)` and for both argument orders.
# Fix: the test-set names use `$` interpolation (previously written as `\$`,
# which produced the literal names " $Flt" and "$pow_fast" for every type).
@testset verbose = true "Tests" begin
    @testset verbose = false "Fast `bool ^ float`" begin
        for Flt in subtypes(AbstractFloat)
            @testset verbose = true " $Flt" begin
                n, _, x, y = get_test_data1(Flt)
                @testset "$pow_fast" begin
                    for i = 1 : n
                        r_native = pow_native(x[i], y[i])
                        r_fast = pow_fast(x[i], y[i])
                        # `==` rather than `===` because big(1.0) !== big(1.0);
                        # NaN != NaN, so a pair of NaNs is accepted explicitly.
                        @test r_native == r_fast || (isnan(r_native) && isnan(r_fast))
                    end
                end
            end
        end
    end
    @testset verbose = false "Fast `float ^ bool`" begin
        for Flt in subtypes(AbstractFloat)
            @testset verbose = true " $Flt" begin
                n, _, x, y = get_test_data2(Flt)
                @testset "$pow_fast" begin
                    for i = 1 : n
                        r_native = pow_native(x[i], y[i])
                        r_fast = pow_fast(x[i], y[i])
                        # `==` rather than `===` because big(1.0) !== big(1.0);
                        # NaN != NaN, so a pair of NaNs is accepted explicitly.
                        @test r_native == r_fast || (isnan(r_native) && isnan(r_fast))
                    end
                end
            end
        end
    end
end;

# benchmarks
# Benchmark kernel: for i in 1:n store `f(x[i], y[i])` into `r[i]`.
# Only the first `n` entries of `r`, `x` and `y` are touched; returns `nothing`.
function f!(f, r, x, y, n)
    for i in 1:n
        r[i] = f(x[i], y[i])
    end
    return nothing
end;

# Time `f!` with the native and the fast power function for every type returned
# by `subtypes(AbstractFloat)`. For each type the first timing printed is the
# native one and the second is the fast one.
# Fix: `$` is used for interpolation (previously written as `\$`, which does not
# parse inside the `@btime` calls and prints a literal "$Flt:" label).
println("\nFast `bool ^ float` benchmark (1 - native 2 - fast):")

for Flt in subtypes(AbstractFloat)
    n, r, x, y = get_test_data1(Flt)
    println(" $Flt:")
    # `$` splices the loop-local values into the benchmarked expression.
    @btime f!($pow_native, $r, $x, $y, $n)
    @btime f!($pow_fast, $r, $x, $y, $n)
end

println("\nFast `float ^ bool` benchmark (1 - native 2 - fast):")
for Flt in subtypes(AbstractFloat)
    n, r, x, y = get_test_data2(Flt)
    println(" $Flt:")
    @btime f!($pow_native, $r, $x, $y, $n)
    @btime f!($pow_fast, $r, $x, $y, $n)
end

# Closing Markdown code fence for the printed report.
println("```")

end;

``````
2 Likes

Just copy and paste the output (Ctrl+C, Ctrl+V),
like this:

AMD Ryzen 9 3900XT 12-Core Processor

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.0s
Fast `bool ^ float` | 40000  40000  0.5s
Fast `float ^ bool` | 40000  40000  0.5s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.130 ms (40000 allocations: 1.98 MiB)
821.100 μs (20000 allocations: 1015.62 KiB)
Float16:
203.400 μs (0 allocations: 0 bytes)
14.200 μs (0 allocations: 0 bytes)
Float32:
257.400 μs (0 allocations: 0 bytes)
1.440 μs (0 allocations: 0 bytes)
Float64:
253.900 μs (0 allocations: 0 bytes)
3.100 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
600.100 μs (20000 allocations: 1015.62 KiB)
492.500 μs (10068 allocations: 511.27 KiB)
Float16:
38.000 μs (0 allocations: 0 bytes)
6.840 μs (0 allocations: 0 bytes)
Float32:
37.000 μs (0 allocations: 0 bytes)
706.207 ns (0 allocations: 0 bytes)
Float64:
31.600 μs (0 allocations: 0 bytes)
1.460 μs (0 allocations: 0 bytes)
``````

Intel(R) Core™ i5-4258U CPU @ 2.40GHz

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.9s
Fast `bool ^ float` | 40000  40000  2.0s
Fast `float ^ bool` | 40000  40000  0.9s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.287 ms (40000 allocations: 1.98 MiB)
947.704 μs (20000 allocations: 1015.62 KiB)
Float16:
204.216 μs (0 allocations: 0 bytes)
28.564 μs (0 allocations: 0 bytes)
Float32:
213.907 μs (0 allocations: 0 bytes)
4.211 μs (0 allocations: 0 bytes)
Float64:
267.693 μs (0 allocations: 0 bytes)
9.657 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):

BigFloat:
646.104 μs (20000 allocations: 1015.62 KiB)
552.348 μs (10044 allocations: 510.05 KiB)
Float16:
70.192 μs (0 allocations: 0 bytes)
15.225 μs (0 allocations: 0 bytes)
Float32:
68.144 μs (0 allocations: 0 bytes)
1.885 μs (0 allocations: 0 bytes)
Float64:
59.002 μs (0 allocations: 0 bytes)
3.813 μs (0 allocations: 0 bytes)
``````
1 Like

OS: Windows (x86_64-w64-mingw32)
CPU: 8 × 11th Gen Intel(R) Core™ i7-1165G7 @ 2.80GHz

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.4s
Fast `bool ^ float` | 40000  40000  1.7s
Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
765.500 μs (40000 allocations: 1.98 MiB)
570.800 μs (20000 allocations: 1015.62 KiB)
Float16:
113.100 μs (0 allocations: 0 bytes)
22.200 μs (0 allocations: 0 bytes)
Float32:
111.200 μs (0 allocations: 0 bytes)
1.090 μs (0 allocations: 0 bytes)
Float64:
140.200 μs (0 allocations: 0 bytes)
2.144 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):

BigFloat:
391.300 μs (20000 allocations: 1015.62 KiB)
320.400 μs (9962 allocations: 505.88 KiB)
Float16:
15.400 μs (0 allocations: 0 bytes)
4.229 μs (0 allocations: 0 bytes)
Float32:
20.500 μs (0 allocations: 0 bytes)
691.753 ns (0 allocations: 0 bytes)
Float64:
31.300 μs (0 allocations: 0 bytes)
1.490 μs (0 allocations: 0 bytes)
``````
1 Like

12th Gen Intel(R) Core™ i7-12700K

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  0.9s
Fast `bool ^ float` | 40000  40000  0.5s
Fast `float ^ bool` | 40000  40000  0.4s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
677.800 μs (40000 allocations: 1.98 MiB)
488.700 μs (20000 allocations: 1015.62 KiB)
Float16:
200.600 μs (0 allocations: 0 bytes)
9.700 μs (0 allocations: 0 bytes)
Float32:
241.400 μs (0 allocations: 0 bytes)
1.560 μs (0 allocations: 0 bytes)
Float64:
222.400 μs (0 allocations: 0 bytes)
3.400 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
330.600 μs (20000 allocations: 1015.62 KiB)
288.500 μs (9928 allocations: 504.16 KiB)
Float16:
31.900 μs (0 allocations: 0 bytes)
4.057 μs (0 allocations: 0 bytes)
Float32:
32.700 μs (0 allocations: 0 bytes)
656.667 ns (0 allocations: 0 bytes)
Float64:
27.000 μs (0 allocations: 0 bytes)
1.280 μs (0 allocations: 0 bytes)
``````
1 Like

Edited so that it can simply be run in the REPL.

Intel(R) Celeron(R) CPU B815 @ 1.60GHz
Julia-1.8.3

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  7.7s
Fast `bool ^ float` | 40000  40000  5.5s
Fast `float ^ bool` | 40000  40000  2.2s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
3.705 ms (40000 allocations: 1.98 MiB)
2.529 ms (20000 allocations: 1015.62 KiB)
Float16:
674.400 μs (0 allocations: 0 bytes)
182.400 μs (0 allocations: 0 bytes)
Float32:
400.100 μs (0 allocations: 0 bytes)
13.000 μs (0 allocations: 0 bytes)
Float64:
776.300 μs (0 allocations: 0 bytes)
29.000 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
1.909 ms (20000 allocations: 1015.62 KiB)
1.522 ms (10056 allocations: 510.66 KiB)
Float16:
315.800 μs (0 allocations: 0 bytes)
28.600 μs (0 allocations: 0 bytes)
Float32:
135.200 μs (0 allocations: 0 bytes)
5.133 μs (0 allocations: 0 bytes)
Float64:
133.300 μs (0 allocations: 0 bytes)
10.900 μs (0 allocations: 0 bytes)
``````

Intel(R) Core™ i5-8250U CPU @ 1.60GHz
Julia-1.8.3

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  3.4s
Fast `bool ^ float` | 40000  40000  2.7s
Fast `float ^ bool` | 40000  40000  0.7s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.068 ms (40000 allocations: 1.98 MiB)
782.911 μs (20000 allocations: 1015.62 KiB)
Float16:
189.180 μs (0 allocations: 0 bytes)
24.224 μs (0 allocations: 0 bytes)
Float32:
171.924 μs (0 allocations: 0 bytes)
2.362 μs (0 allocations: 0 bytes)
Float64:
239.502 μs (0 allocations: 0 bytes)
5.628 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
552.802 μs (20000 allocations: 1015.62 KiB)
466.761 μs (10030 allocations: 509.34 KiB)
Float16:
67.050 μs (0 allocations: 0 bytes)
15.696 μs (0 allocations: 0 bytes)
Float32:
57.782 μs (0 allocations: 0 bytes)
1.145 μs (0 allocations: 0 bytes)
Float64:
53.871 μs (0 allocations: 0 bytes)
2.180 μs (0 allocations: 0 bytes)
``````
1 Like

Apple M1

``````
**Apple M1**
**Julia-1.10.0-DEV.39**

Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.0s
Fast `bool ^ float` | 40000  40000  0.7s
Fast `float ^ bool` | 40000  40000  0.3s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
642.959 μs (40000 allocations: 1.98 MiB)
505.334 μs (20000 allocations: 1015.62 KiB)
Float16:
120.708 μs (0 allocations: 0 bytes)
1.179 μs (0 allocations: 0 bytes)
Float32:
124.208 μs (0 allocations: 0 bytes)
2.546 μs (0 allocations: 0 bytes)
Float64:
252.917 μs (0 allocations: 0 bytes)
5.104 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
354.458 μs (20000 allocations: 1015.62 KiB)
245.042 μs (10096 allocations: 512.69 KiB)
Float16:
34.250 μs (0 allocations: 0 bytes)
456.431 ns (0 allocations: 0 bytes)
Float32:
34.291 μs (0 allocations: 0 bytes)
1.033 μs (0 allocations: 0 bytes)
Float64:
40.458 μs (0 allocations: 0 bytes)
2.852 μs (0 allocations: 0 bytes)
``````
1 Like

It is cool: the M1 can process half precision (Float16) faster than float and double, unlike all these Intel and AMD processors.

1 Like

12th Gen Intel(R) Core™ i7-1270P
Julia-1.8.3

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  3.0s
Fast `bool ^ float` | 40000  40000  2.1s
Fast `float ^ bool` | 40000  40000  0.9s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
997.800 μs (40000 allocations: 1.98 MiB)
765.700 μs (20000 allocations: 1015.62 KiB)
Float16:
146.900 μs (0 allocations: 0 bytes)
14.100 μs (0 allocations: 0 bytes)
Float32:
146.800 μs (0 allocations: 0 bytes)
2.622 μs (0 allocations: 0 bytes)
Float64:
165.500 μs (0 allocations: 0 bytes)
5.157 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
571.300 μs (20000 allocations: 1015.62 KiB)
483.400 μs (10036 allocations: 509.64 KiB)
Float16:
59.700 μs (0 allocations: 0 bytes)
5.900 μs (0 allocations: 0 bytes)
Float32:
47.400 μs (0 allocations: 0 bytes)
1.230 μs (0 allocations: 0 bytes)
Float64:
39.900 μs (0 allocations: 0 bytes)
2.178 μs (0 allocations: 0 bytes)
``````

AMD Ryzen 9 5950X 16-Core Processor
Julia-1.8.3

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.6s
Fast `bool ^ float` | 40000  40000  1.1s
Fast `float ^ bool` | 40000  40000  0.5s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
795.900 μs (40000 allocations: 1.98 MiB)
584.600 μs (20000 allocations: 1015.62 KiB)
Float16:
93.900 μs (0 allocations: 0 bytes)
11.200 μs (0 allocations: 0 bytes)
Float32:
84.400 μs (0 allocations: 0 bytes)
1.190 μs (0 allocations: 0 bytes)
Float64:
111.600 μs (0 allocations: 0 bytes)
2.378 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
420.100 μs (20000 allocations: 1015.62 KiB)
328.800 μs (9980 allocations: 506.80 KiB)
Float16:
16.600 μs (0 allocations: 0 bytes)
4.857 μs (0 allocations: 0 bytes)
Float32:
16.400 μs (0 allocations: 0 bytes)
599.444 ns (0 allocations: 0 bytes)
Float64:
12.300 μs (0 allocations: 0 bytes)
1.100 μs (0 allocations: 0 bytes)
``````

Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz
Julia-1.8.0

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.6s
Fast `bool ^ float` | 40000  40000  1.8s
Fast `float ^ bool` | 40000  40000  0.8s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.427 ms (40000 allocations: 1.98 MiB)
1.037 ms (20000 allocations: 1015.62 KiB)
Float16:
334.900 μs (0 allocations: 0 bytes)
37.600 μs (0 allocations: 0 bytes)
Float32:
392.500 μs (0 allocations: 0 bytes)
1.820 μs (0 allocations: 0 bytes)
Float64:
373.200 μs (0 allocations: 0 bytes)
3.538 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
751.700 μs (20000 allocations: 1015.62 KiB)
633.900 μs (9984 allocations: 507.00 KiB)
Float16:
26.700 μs (0 allocations: 0 bytes)
19.900 μs (0 allocations: 0 bytes)
Float32:
25.100 μs (0 allocations: 0 bytes)
1.000 μs (0 allocations: 0 bytes)
Float64:
58.200 μs (0 allocations: 0 bytes)
2.067 μs (0 allocations: 0 bytes)
``````
1 Like

thank you

That’s not really surprising, since it’s one of the CPUs at the moment that has hardware support for half-precision floating-point numbers.

5 Likes

AMD Ryzen 5 3500X 6-Core Processor
Julia-1.7.2

``````
Test Summary:         |  Pass  Total
Tests                 | 80000  80000
Fast `bool ^ float` | 40000  40000
Fast `float ^ bool` | 40000  40000

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.311 ms (40000 allocations: 2.14 MiB)
893.300 μs (20000 allocations: 1.07 MiB)
Float16:
115.200 μs (0 allocations: 0 bytes)
17.500 μs (0 allocations: 0 bytes)
Float32:
279.100 μs (0 allocations: 0 bytes)
52.900 μs (0 allocations: 0 bytes)
Float64:
236.300 μs (0 allocations: 0 bytes)
45.200 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
670.200 μs (20000 allocations: 1.07 MiB)
556.300 μs (9908 allocations: 541.84 KiB)
Float16:
25.600 μs (0 allocations: 0 bytes)
22.900 μs (0 allocations: 0 bytes)
Float32:
22.600 μs (0 allocations: 0 bytes)
22.600 μs (0 allocations: 0 bytes)
Float64:
20.200 μs (0 allocations: 0 bytes)
20.400 μs (0 allocations: 0 bytes)
``````
1 Like

valuable result

Maybe someone has exotic or ancient hardware, or mobile processors?

Intel(R) Core™ i5-8365U CPU @ 1.60GHz
Julia-1.6.6

``````
Test Summary:         |  Pass  Total
Tests                 | 80000  80000
Fast `bool ^ float` | 40000  40000
Fast `float ^ bool` | 40000  40000

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.325 ms (40000 allocations: 2.14 MiB)
1.200 ms (20000 allocations: 1.07 MiB)
Float16:
133.400 μs (0 allocations: 0 bytes)
20.500 μs (0 allocations: 0 bytes)
Float32:
148.700 μs (0 allocations: 0 bytes)
84.300 μs (0 allocations: 0 bytes)
Float64:
164.000 μs (0 allocations: 0 bytes)
86.400 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
704.500 μs (20000 allocations: 1.07 MiB)
607.900 μs (10076 allocations: 551.03 KiB)
Float16:
52.000 μs (0 allocations: 0 bytes)
57.000 μs (0 allocations: 0 bytes)
Float32:
50.000 μs (0 allocations: 0 bytes)
51.500 μs (0 allocations: 0 bytes)
Float64:
50.300 μs (0 allocations: 0 bytes)
34.900 μs (0 allocations: 0 bytes)
``````
1 Like

An even faster version:

``````julia
module M
using BenchmarkTools, Test, InteractiveUtils

# Report header: CPU model of the first entry returned by `Sys.cpu_info()`, the
# running Julia version, and a marker line identifying this script variant.
let cpu = Sys.cpu_info()[1]
    println(string("**", rstrip(cpu.model), "**"))
    println(string("**Julia-", VERSION, "**"))
end
println("*more faster\n")

# Opening Markdown code fence for the output that follows.
println("```")

# Reference implementation used for comparison: Julia's own `x ^ y`.
function pow_native(x::X, y::Y) where {X,Y}
    return x ^ y
end

# Fast `Bool ^ float`, short-circuiting on the base.
# A `true` base gives 1 right away. For a `false` base the result is selected
# from the exponent with `ifelse`: 1 for (signed) zero, NaN for NaN, Inf for a
# negative exponent and 0 for a positive one.
function pow_fast(x::Bool, y::T) where T <: AbstractFloat
    x && return one(T)
    nonzero_exp = ifelse(isnan(y), T(NaN), ifelse(signbit(y), T(Inf), zero(T)))
    return ifelse(iszero(y), one(T), nonzero_exp)
end

# BigFloat exponent: select the result as a Float64, then convert it with `big`.
function pow_fast(x::Bool, y::BigFloat)
    x && return big(1.0)
    nonzero_exp = ifelse(isnan(y), NaN, ifelse(signbit(y), Inf, 0.0))
    return big(ifelse(iszero(y), 1.0, nonzero_exp))
end

# Fast `float ^ Bool`: `x ^ true` is `x`, `x ^ false` is 1.
function pow_fast(x::T, y::Bool) where T <: AbstractFloat
    y && return x
    return one(T)
end;

# Author's note kept from the original: only big(1.0), not one(x), not
# BigFloat(1), not one(T). The reason is not stated - presumably benchmark
# speed; TODO confirm.
function pow_fast(x::BigFloat, y::Bool)
    y && return x
    return big(1.0)
end;

# test data
# Inputs for the `Bool ^ T` case. Returns `(n, r, x, y)`:
#   n - number of elements (fixed at 10000)
#   r - zero-filled result buffer of element type T
#   x - random Bool bases
#   y - exponents of type T: normal random numbers raised to the 111th power,
#       with randomly chosen slots overwritten by special values
function get_test_data1(::Type{T}) where T <: AbstractFloat
    n = 10000
    m = n ÷ 100  # indices drawn (with replacement) per special value
    r = zeros(T, n)
    x = rand(Bool, n)
    # Disabled alternatives kept from the original (all-true / all-false bases):
    #x = ones(Bool, n)
    #x = zeros(Bool, n)
    y = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
    # Special values of type T, written in this order; a later value may
    # overwrite a slot taken by an earlier one.
    specials = (
        T(NaN),
        -T(Inf),
        T(Inf),
        nextfloat(-T(Inf)),
        prevfloat(T(Inf)),
        T(0),
        -T(0),
    )
    for s in specials
        y[rand(1:n, m)] .= s
    end
    return n, r, x, y
end

# Inputs for the `T ^ Bool` case. Returns `(n, r, x, y)`:
#   n - number of elements (fixed at 10000)
#   r - zero-filled result buffer of element type T
#   x - bases of type T: normal random numbers raised to the 111th power,
#       with randomly chosen slots overwritten by special values
#   y - random Bool exponents
function get_test_data2(::Type{T}) where T <: AbstractFloat
    n = 10000
    m = n ÷ 100  # indices drawn (with replacement) per special value
    r = zeros(T, n)
    x = (T <: BigFloat ? big.(randn(Float64, n)) : randn(T, n)) .^ 111
    y = rand(Bool, n)
    # Disabled alternatives kept from the original (all-true / all-false exponents):
    #y = ones(Bool, n)
    #y = zeros(Bool, n)
    # Special values of type T, written in this order; a later value may
    # overwrite a slot taken by an earlier one.
    specials = (
        T(NaN),
        -T(Inf),
        T(Inf),
        nextfloat(-T(Inf)),
        prevfloat(T(Inf)),
        T(0),
        -T(0),
    )
    for s in specials
        x[rand(1:n, m)] .= s
    end
    return n, r, x, y
end

# tests
# Compare `pow_fast` with `pow_native` element by element, for every type
# returned by `subtypes(AbstractFloat)` and for both argument orders.
# Fix: the test-set names use `$` interpolation (previously written as `\$`,
# which produced the literal names " $Flt" and "$pow_fast" for every type).
@testset verbose = true "Tests" begin
    @testset verbose = false "Fast `bool ^ float`" begin
        for Flt in subtypes(AbstractFloat)
            @testset verbose = true " $Flt" begin
                n, _, x, y = get_test_data1(Flt)
                @testset "$pow_fast" begin
                    for i = 1 : n
                        r_native = pow_native(x[i], y[i])
                        r_fast = pow_fast(x[i], y[i])
                        # `==` rather than `===` because big(1.0) !== big(1.0);
                        # NaN != NaN, so a pair of NaNs is accepted explicitly.
                        @test r_native == r_fast || (isnan(r_native) && isnan(r_fast))
                    end
                end
            end
        end
    end
    @testset verbose = false "Fast `float ^ bool`" begin
        for Flt in subtypes(AbstractFloat)
            @testset verbose = true " $Flt" begin
                n, _, x, y = get_test_data2(Flt)
                @testset "$pow_fast" begin
                    for i = 1 : n
                        r_native = pow_native(x[i], y[i])
                        r_fast = pow_fast(x[i], y[i])
                        # `==` rather than `===` because big(1.0) !== big(1.0);
                        # NaN != NaN, so a pair of NaNs is accepted explicitly.
                        @test r_native == r_fast || (isnan(r_native) && isnan(r_fast))
                    end
                end
            end
        end
    end
end;

# benchmarks
# func
# Benchmark kernel: for i in 1:n store `f(x[i], y[i])` into `r[i]`.
# Only the first `n` entries of `r`, `x` and `y` are touched; returns `nothing`.
function f!(f, r, x, y, n)
    for i in 1:n
        r[i] = f(x[i], y[i])
    end
    return nothing
end;

# Time `f!` with the native and the fast power function for every type returned
# by `subtypes(AbstractFloat)`. For each type the first timing printed is the
# native one and the second is the fast one.
# Fix: `$` is used for interpolation (previously written as `\$`, which does not
# parse inside the `@btime` calls and prints a literal "$Flt:" label).
println("\nFast `bool ^ float` benchmark (1 - native 2 - fast):")

for Flt in subtypes(AbstractFloat)
    n, r, x, y = get_test_data1(Flt)
    println(" $Flt:")
    # `$` splices the loop-local values into the benchmarked expression.
    @btime f!($pow_native, $r, $x, $y, $n)
    @btime f!($pow_fast, $r, $x, $y, $n)
end

println("\nFast `float ^ bool` benchmark (1 - native 2 - fast):")
for Flt in subtypes(AbstractFloat)
    n, r, x, y = get_test_data2(Flt)
    println(" $Flt:")
    @btime f!($pow_native, $r, $x, $y, $n)
    @btime f!($pow_fast, $r, $x, $y, $n)
end

# Closing Markdown code fence for the printed report.
println("```")

end;
``````

Intel(R) Core™ i7-9750H CPU @ 2.60GHz
Julia-1.8.3

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  2.2s
Fast `bool ^ float` | 40000  40000  1.6s
Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
723.195 μs (40000 allocations: 1.98 MiB)
484.415 μs (20000 allocations: 1015.62 KiB)
Float16:
153.620 μs (0 allocations: 0 bytes)
18.500 μs (0 allocations: 0 bytes)
Float32:
142.652 μs (0 allocations: 0 bytes)
1.567 μs (0 allocations: 0 bytes)
Float64:
199.983 μs (0 allocations: 0 bytes)
3.304 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
399.706 μs (20000 allocations: 1015.62 KiB)
273.336 μs (10024 allocations: 509.03 KiB)
Float16:
58.514 μs (0 allocations: 0 bytes)
13.401 μs (0 allocations: 0 bytes)
Float32:
56.400 μs (0 allocations: 0 bytes)
950.435 ns (0 allocations: 0 bytes)
Float64:
45.594 μs (0 allocations: 0 bytes)
1.707 μs (0 allocations: 0 bytes)
``````
1 Like

AMD Ryzen 9 3900XT 12-Core Processor
Julia-1.8.0
*more faster

``````
Test Summary:         |  Pass  Total  Time
Tests                 | 80000  80000  1.9s
Fast `bool ^ float` | 40000  40000  1.3s
Fast `float ^ bool` | 40000  40000  0.6s

Fast `bool ^ float` benchmark (1 - native 2 - fast):
BigFloat:
1.198 ms (40000 allocations: 1.98 MiB)
808.500 μs (20000 allocations: 1015.62 KiB)
Float16:
224.500 μs (0 allocations: 0 bytes)
14.100 μs (0 allocations: 0 bytes)
Float32:
276.900 μs (0 allocations: 0 bytes)
1.210 μs (0 allocations: 0 bytes)
Float64:
257.400 μs (0 allocations: 0 bytes)
2.100 μs (0 allocations: 0 bytes)

Fast `float ^ bool` benchmark (1 - native 2 - fast):
BigFloat:
637.100 μs (20000 allocations: 1015.62 KiB)
526.000 μs (9952 allocations: 505.38 KiB)
Float16:
37.200 μs (0 allocations: 0 bytes)
6.700 μs (0 allocations: 0 bytes)
Float32:
36.600 μs (0 allocations: 0 bytes)
726.515 ns (0 allocations: 0 bytes)
Float64:
33.000 μs (0 allocations: 0 bytes)
1.410 μs (0 allocations: 0 bytes)
``````