Actually, one can optimize this a bit further by using the identity `-max(a, b) == min(-a, -b)`:
"""
    superbee_refactor3!(s, a, b, h)

Fill `s` in place, element by element, from `a`, `b` and the scalar factor `h`.

For every index `k` shared by `s`, `a` and `b`:
- if `a[k] > 0` and `b[k] > 0`, store `max(min(a[k], 2b[k]), min(2a[k], b[k])) * h`;
- otherwise, if `b[k] < 0`, store `min(max(a[k], 2b[k]), max(2a[k], b[k])) * h`;
- otherwise store `0 * h`.

The second formula is `-max(min(-a, -2b), min(-2a, -b))` rewritten with the identity
`-max(x, y) == min(-x, -y)`, which avoids the negations.

Returns `nothing`.
"""
function superbee_refactor3!(s::T, a::T, b::T, h::Float64) where {T>:Vector{Float64}}
    # The indices come from eachindex(s, a, b), so they are valid for all three arrays;
    # no explicit @inbounds annotation is used.
    @fastmath for k in eachindex(s, a, b)
        x = a[k]
        y = b[k]
        x2 = 2 * x
        y2 = 2 * y
        both_positive = max(min(x, y2), min(x2, y))
        b_negative = min(max(x, y2), max(x2, y))
        # NOTE(review): only `y < 0` is tested here, so x > 0, y < 0 yields x * h while
        # x < 0, y > 0 yields 0 -- confirm whether `x < 0 && y < 0` was intended.
        otherwise = ifelse(y < 0, b_negative, zero(eltype(T)))
        s[k] = ifelse(x > 0 && y > 0, both_positive, otherwise) * h
    end
    return nothing
end
This shaves off another ~8ns (~20%) for me:
julia> @benchmark superbee_refactor3!($s,$a,$b,$h)
BenchmarkTools.Trial: 10000 samples with 993 evaluations.
Range (min … max): 34.887 ns … 55.143 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 36.574 ns ┊ GC (median): 0.00%
Time (mean ± σ): 36.939 ns ± 1.519 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▁ ▂▅▄█▇▅▇▁▂▁ ▁ ▁▂ ▁ ▂ ▂ ▁ ▂
▅▅▄█▇██████████▅█▇██▅███████▇█▆▆▆▃▆▆▅▆▄▆▆▃▅▃▅▅▁▃▃▄▃▄▁▃▅▄▄▅▄ █
34.9 ns Histogram: log(frequency) by time 45.7 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
EDIT: I also tried using the `minmax` function, but this breaks something and is a lot slower. I am not sure what the difference is, as the assembly looks superficially very similar. For some reason, this version cannot use AVX instructions (256-bit) and only uses SSE instructions (128-bit).
"""
    superbee_refactor4!(s, a, b, h)

Fill `s` in place, element by element, from `a`, `b` and the scalar factor `h`, using
`minmax` to obtain each min/max pair in one call.

For every index `i` shared by `s`, `a` and `b`:
- if `a[i] > 0` and `b[i] > 0`, store `max(min(a[i], 2b[i]), min(2a[i], b[i])) * h`;
- otherwise, if `b[i] < 0`, store `min(max(a[i], 2b[i]), max(2a[i], b[i])) * h`;
- otherwise store `0 * h`.

Returns `nothing`.
"""
function superbee_refactor4!(s::T, a::T, b::T, h::Float64) where {T>:Vector{Float64}}
    # The indices come from eachindex(s, a, b), so they are valid for all three arrays;
    # no explicit @inbounds annotation is used.
    @fastmath @simd for i in eachindex(s, a, b)
        ai = a[i]
        bi = b[i]
        # (m1, m2) = (min, max) of (ai, 2bi); (m3, m4) = (min, max) of (2ai, bi).
        m1, m2 = minmax(ai, 2.0 * bi)
        # Fixed: the second argument used to be the literal 2.0 instead of bi, which
        # made t1/t2 disagree with the min/max formulas in the docstring.
        m3, m4 = minmax(2.0 * ai, bi)
        t1 = max(m1, m3)
        t2 = min(m2, m4)
        # NOTE(review): only `bi < 0` is tested here, so ai > 0, bi < 0 yields ai * h
        # while ai < 0, bi > 0 yields 0 -- confirm whether `ai < 0 && bi < 0` was intended.
        s[i] = ifelse(ai > 0 && bi > 0, t1, ifelse(bi < 0, t2, zero(eltype(T)))) * h
    end
    return nothing
end
julia> @benchmark superbee_refactor4!($s,$a,$b,$h)
BenchmarkTools.Trial: 10000 samples with 803 evaluations.
Range (min … max): 154.908 ns … 518.042 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 156.822 ns ┊ GC (median): 0.00%
Time (mean ± σ): 158.300 ns ± 11.322 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▂▂▆▇█▆▅▃▁▂▁▁▁ ▂▂▃▃▁ ▂
█████████████████████▆▆▇▆▆▆▄▆▄▅▅▅▇▆▅▅▆▇▆▇▆▇▇▆▇▇▆▇▆▆▇▆▇▆▄▆▆▆▁▆ █
155 ns Histogram: log(frequency) by time 177 ns <
Memory estimate: 0 bytes, allocs estimate: 0.