Other options…
julia> @noinline ThrowLengthMismatchError() = throw("Lengths aren't equal!")
ThrowLengthMismatchError (generic function with 1 method)
julia> function directexp!(c,a,b)
    N = length(a)
    N == length(b) == length(c) || ThrowLengthMismatchError()
    @inbounds for n in 1:N
        c[n] = a[n]^b[n]
    end
end
directexp! (generic function with 1 method)
julia> function cobexp!(c,a,b)
    N = length(a)
    N == length(b) == length(c) || ThrowLengthMismatchError()
    @inbounds for n in 1:N
        c[n] = exp(b[n]*log(a[n]))
    end
end
cobexp! (generic function with 1 method)
julia> syspower(a, b) = ccall(:pow,Float64,(Float64,Float64),a,b)
syspower (generic function with 1 method)
julia> function syspow!(c,a,b)
    N = length(a)
    N == length(b) == length(c) || ThrowLengthMismatchError()
    @inbounds for n in 1:N
        c[n] = syspower(a[n], b[n])
    end
end
syspow! (generic function with 1 method)
julia> using SIMDPirates, LoopVectorization
[ Info: Recompiling stale cache file /home/chriselrod/.julia/compiled/v1.3/LoopVectorization/4TogI.ji for LoopVectorization [bdcacae8-1622-11e9-2a5c-532679323890]
julia> using SLEEFPirates, SLEEFwrap, xsimdwrap
[ Info: Recompiling stale cache file /home/chriselrod/.julia/compiled/v1.3/SLEEFwrap/GD9JX.ji for SLEEFwrap [6e748a96-c50b-11e8-2c8a-ed394f64ea02]
[ Info: Recompiling stale cache file /home/chriselrod/.julia/compiled/v1.3/xsimdwrap/NPDTn.ji for xsimdwrap [b9b7a223-629f-5757-a0fe-5fc90c39777c]
julia> # requires unregistered dependencies:
# https://github.com/chriselrod/VectorizationBase.jl#master
# https://github.com/chriselrod/SIMDPirates.jl#master
# https://github.com/chriselrod/LoopVectorization.jl#master
# https://github.com/chriselrod/SLEEFPirates.jl#master
# https://github.com/chriselrod/SLEEFwrap.jl#master
# https://github.com/chriselrod/xsimdwrap.jl#master
@generated function jsleefpow!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @vectorize $T for n in 1:N
            c[n] = a[n] ^ b[n]
        end
    end
end
jsleefpow! (generic function with 1 method)
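As an aside: several of the dependencies named in the comments above are unregistered, so they have to be added by URL. A minimal sketch of one way to do that, assuming those GitHub URLs are still current:

using Pkg
# add an unregistered package directly from its repository, tracking master
Pkg.add(PackageSpec(url = "https://github.com/chriselrod/VectorizationBase.jl", rev = "master"))
# ...and likewise for SIMDPirates, LoopVectorization, SLEEFPirates, SLEEFwrap, and xsimdwrap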
julia> @generated function csleefpow!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Union{Float32,Float64}}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @cvectorize $T for n in 1:N
            c[n] = a[n] ^ b[n]
        end
    end
end
csleefpow! (generic function with 1 method)
julia> @generated function xsimdpow!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Union{Float32,Float64}}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @xvectorize $T for n in 1:N
            c[n] = a[n] ^ b[n]
        end
    end
end
xsimdpow! (generic function with 1 method)
julia> @generated function jsleefpowcob!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @vectorize $T for n in 1:N
            c[n] = exp(b[n] * log(a[n]))
        end
    end
end
jsleefpowcob! (generic function with 1 method)
julia> @generated function csleefpowcob!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Union{Float32,Float64}}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @cvectorize $T for n in 1:N
            c[n] = exp(b[n] * log(a[n]))
        end
    end
end
csleefpowcob! (generic function with 1 method)
julia> @generated function xsimdpowcob!(c::AbstractArray{T}, a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Union{Float32,Float64}}
    quote
        N = length(a)
        N == length(b) == length(c) || ThrowLengthMismatchError()
        @xvectorize $T for n in 1:N
            c[n] = exp(b[n] * log(a[n]))
        end
    end
end
xsimdpowcob! (generic function with 1 method)
julia> a = rand(1000);
julia> b = rand(1000);
julia> c = zeros(1000);
julia> using BenchmarkTools
julia> @benchmark directexp!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 14.081 μs (0.00% GC)
median time: 14.204 μs (0.00% GC)
mean time: 14.240 μs (0.00% GC)
maximum time: 24.483 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark cobexp!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 19.695 μs (0.00% GC)
median time: 19.928 μs (0.00% GC)
mean time: 19.970 μs (0.00% GC)
maximum time: 30.733 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark syspow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 12.908 μs (0.00% GC)
median time: 12.998 μs (0.00% GC)
mean time: 13.031 μs (0.00% GC)
maximum time: 23.834 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark jsleefpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 6.537 μs (0.00% GC)
median time: 6.562 μs (0.00% GC)
mean time: 6.633 μs (0.00% GC)
maximum time: 11.406 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 5
julia> @benchmark csleefpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 6.052 μs (0.00% GC)
median time: 6.094 μs (0.00% GC)
mean time: 6.112 μs (0.00% GC)
maximum time: 9.819 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 5
julia> @benchmark xsimdpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 4.179 μs (0.00% GC)
median time: 4.191 μs (0.00% GC)
mean time: 4.202 μs (0.00% GC)
maximum time: 6.713 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 7
julia> @benchmark jsleefpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 2.442 μs (0.00% GC)
median time: 2.480 μs (0.00% GC)
mean time: 2.481 μs (0.00% GC)
maximum time: 22.302 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 9
julia> @benchmark csleefpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 2.413 μs (0.00% GC)
median time: 2.422 μs (0.00% GC)
mean time: 2.435 μs (0.00% GC)
maximum time: 4.540 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 9
julia> @benchmark xsimdpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.735 μs (0.00% GC)
median time: 3.746 μs (0.00% GC)
mean time: 3.755 μs (0.00% GC)
maximum time: 5.963 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 8
All times above were in microseconds.
My best times came from exp(b * log(a)) with SLEEF, and my worst from Base.exp(b * Base.log(a)); my Base.:^ was faster than that. I also compiled Julia from source, with MARCH=native in my Make.user.
The vectorized functions were naturally the fastest. The clear winner, if you can stomach the loss of precision, is @robsmith11’s version (which vectorizes after replacing a UInt32(...) with unsafe_trunc(UInt32, ...)).
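To make the UInt32(...) versus unsafe_trunc(UInt32, ...) distinction concrete, here is a small illustration of my own (not code from the thread): converting a Float32 with UInt32(x) demands an exact, in-range integer value and throws InexactError otherwise; trunc(UInt32, x) truncates but still range-checks; Base.unsafe_trunc(UInt32, x) skips the check entirely, so a loop built on it has no error branch to get in the way of SIMD.

x = 100.5f0
# UInt32(x)                    # would throw InexactError: 100.5 is not an integer value
trunc(UInt32, x)               # 100: truncated, but still range-checked, so a branch remains
Base.unsafe_trunc(UInt32, x)   # 100: truncated with no check -- branch-free and SIMD-friendly

That missing check is what lets the @inbounds @simd loop in fastpow! below vectorize.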
julia> # https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastlog.h
function fastlog2(x::Float32)::Float32
    # reinterpret the IEEE754 bit pattern as an integer; suitably scaled and shifted, it approximates log2(x)
    y = Float32(reinterpret(Int32, x))
    y *= 1.1920928955078125f-7  # 2^-23
    y - 126.94269504f0
end
fastlog2 (generic function with 1 method)
julia> function fastlog2(x::Float64)::Float32
    fastlog2(Float32(x))
end
fastlog2 (generic function with 2 methods)
julia> # https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastexp.h
function fastpow2(x::Float32)::Float32
    # clamp the exponent so the bit pattern below stays in range
    clipp = min(126f0, max(-126f0, x))
    # assemble the bit pattern of roughly 2^clipp; unsafe_trunc keeps the conversion branch-free
    reinterpret(Float32, Base.unsafe_trunc(UInt32, (1 << 23) * (clipp + 126.94269504f0)))
end
fastpow2 (generic function with 1 method)
julia> function fastpow2(x::Float64)::Float32
    fastpow2(Float32(x))
end
fastpow2 (generic function with 2 methods)
julia> # https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastpow.h
function fastpow(x::Real, y::Real)::Real
    # x^y ≈ 2^(y * log2(x))
    fastpow2(y * fastlog2(x))
end
fastpow (generic function with 1 method)
julia> function fastpow!(c,a,b)
    N = length(a)
    N == length(b) == length(c) || ThrowLengthMismatchError()
    @inbounds @simd ivdep for n in 1:N
        c[n] = fastpow(a[n], b[n])
    end
end
fastpow! (generic function with 1 method)
julia> @benchmark fastpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 617.977 ns (0.00% GC)
median time: 619.324 ns (0.00% GC)
mean time: 629.481 ns (0.00% GC)
maximum time: 829.486 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 173
I’d be worried that this could lead to convergence issues if you use these functions for calculating log densities and gradients for Hamiltonian Monte Carlo. Does anyone know the level of accuracy required?
For now, I’ll stick with SLEEF/xsimd, which still performed quite well and are accurate over a reasonable range.
I’ve been bitten by single precision before, when many Cholesky factorizations that were fine in double precision failed once I switched to single (on the assumption that vectorized code would run twice as fast), so I’m wary of going single-precision.
EDIT:
A look at the accuracy. I defined:
using Statistics
a = rand(1000);
b = rand(1000);
c = zeros(1000);
const C_EXACT = @. Float64( big(a) ^ big(b) );
mean_exact(c) = mean(c .== C_EXACT)
mean_approx(c) = mean(c .≈ C_EXACT)
function ulp_diff(x, y)
    x, y = minmax(x, y)
    est = round(Int, (y - x) / eps(x))
    while true
        z = nextfloat(x, est)
        z == y && return est
        est += round(Int, (z - y) / eps(z))
    end
end
mean_error(c) = mean(ulp_diff.(c, C_EXACT))
The mean_error function gives the average number of floating-point values (ULPs) by which the results are off; a quick example follows.
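As a quick sanity check of what ulp_diff reports (my own example, relying on the definitions above), it counts the number of representable Float64 values separating its two arguments:

ulp_diff(1.0, 1.0)                 # 0: identical values
ulp_diff(1.0, nextfloat(1.0))      # 1: adjacent Float64 values
ulp_diff(1.0, nextfloat(1.0, 3))   # 3: three representable values apart

So a mean_error of, say, 0.2 means the results are on average about 0.2 ULPs away from the correctly rounded BigFloat answer.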
Now, rerunning the benchmarks and comparing the accuracy with the rounded BigFloat answers (over an admittedly very narrow subset of pow’s domain):
julia> @benchmark directexp!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 13.715 μs (0.00% GC)
median time: 14.199 μs (0.00% GC)
mean time: 14.338 μs (0.00% GC)
maximum time: 227.472 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> mean_exact(c), mean_approx(c), mean_error(c)
(1.0, 1.0, 0.0)
julia> @benchmark cobexp!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 18.915 μs (0.00% GC)
median time: 19.161 μs (0.00% GC)
mean time: 19.280 μs (0.00% GC)
maximum time: 251.692 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.815, 1.0, 0.201)
julia> @benchmark syspow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 13.079 μs (0.00% GC)
median time: 13.146 μs (0.00% GC)
mean time: 13.234 μs (0.00% GC)
maximum time: 30.236 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> mean_exact(c), mean_approx(c), mean_error(c)
(1.0, 1.0, 0.0)
julia> @benchmark jsleefpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 6.635 μs (0.00% GC)
median time: 6.651 μs (0.00% GC)
mean time: 6.675 μs (0.00% GC)
maximum time: 10.282 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 5
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.828, 1.0, 0.19)
julia> @benchmark csleefpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 6.036 μs (0.00% GC)
median time: 6.063 μs (0.00% GC)
mean time: 6.101 μs (0.00% GC)
maximum time: 9.794 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 5
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.984, 1.0, 0.016)
julia> @benchmark xsimdpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 4.101 μs (0.00% GC)
median time: 4.190 μs (0.00% GC)
mean time: 4.182 μs (0.00% GC)
maximum time: 6.699 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 7
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.69, 1.0, 0.325)
julia> @benchmark jsleefpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 2.440 μs (0.00% GC)
median time: 2.449 μs (0.00% GC)
mean time: 2.467 μs (0.00% GC)
maximum time: 4.405 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 9
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.673, 1.0, 0.35)
julia> @benchmark csleefpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 2.413 μs (0.00% GC)
median time: 2.423 μs (0.00% GC)
mean time: 2.427 μs (0.00% GC)
maximum time: 4.557 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 9
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.787, 1.0, 0.246)
julia> @benchmark xsimdpowcob!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.647 μs (0.00% GC)
median time: 3.695 μs (0.00% GC)
mean time: 3.690 μs (0.00% GC)
maximum time: 5.989 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 8
julia> mean_exact(c), mean_approx(c), mean_error(c)
(0.69, 1.0, 0.325)
julia> @benchmark fastpow!($c, $a, $b)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 617.277 ns (0.00% GC)
median time: 644.266 ns (0.00% GC)
mean time: 644.124 ns (0.00% GC)
maximum time: 1.737 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 173
julia> mean_exact(c), mean_approx(c), mean_error(c)
ERROR: InexactError: trunc(Int64, NaN)
Stacktrace:
[1] trunc at ./float.jl:696 [inlined]
[2] round at ./float.jl:361 [inlined]
[3] ulp_diff(::Float64, ::Float64) at ./REPL[61]:7
[4] _broadcast_getindex_evalf at ./broadcast.jl:625 [inlined]
[5] _broadcast_getindex at ./broadcast.jl:598 [inlined]
[6] getindex at ./broadcast.jl:558 [inlined]
[7] macro expansion at ./broadcast.jl:888 [inlined]
[8] macro expansion at ./simdloop.jl:77 [inlined]
[9] copyto! at ./broadcast.jl:887 [inlined]
[10] copyto! at ./broadcast.jl:842 [inlined]
[11] copy at ./broadcast.jl:818 [inlined]
[12] materialize at ./broadcast.jl:798 [inlined]
[13] mean_error(::Array{Float64,1}) at ./REPL[62]:1
[14] top-level scope at REPL[84]:1
julia> mean_exact(c), mean_approx(c)
(0.0, 0.0)
The system pow (called using ccall(:pow, ...)) and Julia’s default ^ were the most numerically accurate. Unsurprisingly, exp(b * log(a)) was generally less accurate than pow.
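For spot-checking a single input pair against the correctly rounded answer, rather than the whole-vector summaries above, something like the following works (my own sketch; the inputs are arbitrary and the exact ULP gaps depend on them):

x, y = 0.3, 0.7
ref = Float64(big(x) ^ big(y))   # correctly rounded Float64 reference
ulp_diff(x ^ y, ref)             # Base ^ : expected to be zero or very small
ulp_diff(exp(y * log(x)), ref)   # change of base: typically a little worse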