Taking my implementation from Renderer/src/RayTracingInOneWeekend.jl at master · Zentrik/Renderer · GitHub,
using SIMD
function hor_min(x::SIMD.Vec{8, F}) where F
@fastmath a = shufflevector(x, Val((4, 5, 6, 7, :undef, :undef, :undef, :undef))) # high half
@fastmath b = min(a, x)
@fastmath a = shufflevector(b, Val((2, 3, :undef, :undef, :undef, :undef, :undef, :undef)))
@fastmath b = min(a, b)
@fastmath a = shufflevector(b, Val((1, :undef, :undef, :undef, :undef, :undef, :undef, :undef)))
@fastmath b = min(a, b)
return @inbounds b[1]
end
@generated function getBits(mask::SIMD.Vec{N, Bool}) where N #This reverses the bits
s = """
%mask = trunc <$N x i8> %0 to <$N x i1>
%res = bitcast <$N x i1> %mask to i$N
ret i$N %res
"""
return :(
$(Expr(:meta, :inline));
Base.llvmcall($s, UInt8, Tuple{SIMD.LVec{N, Bool}}, mask.data)
)
end
function my_findmin(x)
# Assumes x's length is a multiple of 8
laneIndices = SIMD.Vec{8, Int64}((1, 2, 3, 4, 5, 6, 7, 8))
minvals = SIMD.Vec{8, Float64}(Inf)
min_indices = SIMD.Vec{8, Int64}(0)
lane = VecRange{8}(0)
i = 1
@inbounds @fastmath while i <= length(x)
predicate = x[lane + i] < minvals
minvals = vifelse(predicate, x[lane + i], minvals)
min_indices = vifelse(predicate, laneIndices, min_indices)
i += 8
laneIndices += 8
end
min_value = hor_min(minvals)
min_index = min_indices[trailing_zeros(getBits(min_value == minvals)) + 1]
return min_value, min_index
end
I get
julia> using BenchmarkTools
julia> x = rand(512);
julia> @benchmark my_findmin($(Ref(x))[]) samples=10^5
BenchmarkTools.Trial: 43356 samples with 903 evaluations.
Range (min … max): 122.924 ns … 472.468 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 125.875 ns ┊ GC (median): 0.00%
Time (mean ± σ): 126.235 ns ± 6.468 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▅█▅▁▁▁▇▆▂▃▇▅▂▂▂▂▁▁ ▃
██████████████████████▇▇▇▇▆▇▆▆▆▆▆▆▆▆▆▆▆▆▆▅▆▆▆▅▆▆▆▆▆▆▆▆▆▆▆▇▆▆▆ █
123 ns Histogram: log(frequency) by time 151 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> function naive_findmin(w)
x = @fastmath foldl(min, w)
i = findfirst(==(x), w)::Int
x, i
end
naive_findmin (generic function with 2 methods)
julia> @benchmark naive_findmin($(Ref(x))[]) samples=10^5
BenchmarkTools.Trial: 58706 samples with 285 evaluations.
Range (min … max): 280.425 ns … 936.267 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 288.126 ns ┊ GC (median): 0.00%
Time (mean ± σ): 293.907 ns ± 23.464 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
██▄▂▃▃▅▄▂▁▁▁▁ ▁ ▂
█▃██████████████▇█▇▇▇▇▇▇▇▇▇▇▇▇██▇▆▆▆▆▆▆▆▆▆▅▅▆▆▆▆▅▆▅▅▄▅▄▅▄▅▅▅▄ █
280 ns Histogram: log(frequency) by time 426 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
I don’t think extending my code to support non multiple of 8 lengths should slow things down much.
EDIT: oops, screwed up calculating the indices, not as fast as it first appeared. Though if you have avx512 it will probably be 4x faster.