Optimising superbee! function

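Actually, one can optimize this a bit further by using the identity -max(a,b) == min(-a,-b). Applying it to the outer max and then to each inner min turns -max(min(-a,-2b),min(-2a,-b)) into min(max(a,2b),max(2a,b)), so no negations are needed: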

"""
    superbee_refactor3!(s, a, b, h)

Overwrite `s` element by element with a value derived from `a` and `b`, scaled by `h`.

For every index `i`:
- if `a[i] > 0` and `b[i] > 0`, store `max(min(a[i], 2b[i]), min(2a[i], b[i])) * h`;
- otherwise, if `b[i] < 0`, store `min(max(a[i], 2b[i]), max(2a[i], b[i])) * h`;
- otherwise store zero.

The second expression is `-max(min(-a, -2b), min(-2a, -b))` rewritten with the identity
`-max(x, y) == min(-x, -y)`.

Returns `nothing`; the result is delivered through `s`.
"""
function superbee_refactor3!(s::T, a::T, b::T, h::Float64) where {T>:Vector{Float64}}
    # A single index set shared by all three arrays drives the loop.
    @fastmath for k in eachindex(s, a, b)
        x = a[k]
        y = b[k]
        x2 = 2x
        y2 = 2y
        both_positive = x > 0 && y > 0
        upper = max(min(x, y2), min(x2, y))
        lower = min(max(x, y2), max(x2, y))
        # NOTE(review): only `y < 0` is tested here, so x > 0 with y < 0 also
        # selects `lower` - confirm against the reference implementation.
        otherwise = ifelse(y < 0, lower, zero(eltype(T)))
        # ifelse evaluates both candidates and selects one; nothing branches on the data.
        s[k] = ifelse(both_positive, upper, otherwise) * h
    end
    return nothing
end

This shaves off another ~8ns (~20%) for me:

julia> @benchmark superbee_refactor3!($s,$a,$b,$h)
BenchmarkTools.Trial: 10000 samples with 993 evaluations.
 Range (min … max):  34.887 ns … 55.143 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     36.574 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   36.939 ns ±  1.519 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▁ ▂▅▄█▇▅▇▁▂▁ ▁ ▁▂ ▁  ▂ ▂  ▁                              ▂
  ▅▅▄█▇██████████▅█▇██▅███████▇█▆▆▆▃▆▆▅▆▄▆▆▃▅▃▅▅▁▃▃▄▃▄▁▃▅▄▄▅▄ █
  34.9 ns      Histogram: log(frequency) by time      45.7 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

EDIT: I also tried using the minmax function, but this breaks something and is a lot slower. I am not sure what the difference is, as the assembly looks superficially very similar. For some reason, this version cannot use AVX instructions (256-bit) and only uses SSE instructions (128-bit).

"""
    superbee_refactor4!(s, a, b, h)

Variant of the element-wise update that obtains each min/max pair from one `minmax` call.

For every index `i`:
- if `a[i] > 0` and `b[i] > 0`, store `max(min(a[i], 2b[i]), min(2a[i], b[i])) * h`;
- otherwise, if `b[i] < 0`, store `min(max(a[i], 2b[i]), max(2a[i], b[i])) * h`;
- otherwise store zero.

`s` is overwritten in place; `a` and `b` are only read. Returns `nothing`.
"""
function superbee_refactor4!(s::T, a::T, b::T, h::Float64) where {T>:Vector{Float64}}
    @fastmath @simd for i = eachindex(s,a,b)
        ai = a[i]
        bi = b[i]
        # minmax(x, y) returns (min(x, y), max(x, y)), so the two calls below
        # provide all four inner terms of
        #   t1 = max(min(ai, 2bi), min(2ai, bi))
        #   t2 = min(max(ai, 2bi), max(2ai, bi))
        m1,m2 = minmax(ai,2.0*bi)
        # The second pair must compare 2ai with bi (not with the constant 2.0).
        m3,m4 = minmax(2.0*ai,bi)
        t1 = max(m1,m3)
        t2 = min(m2,m4)
        # NOTE(review): the second branch tests only `bi < 0`, so ai > 0 with
        # bi < 0 also selects t2 - confirm against the reference implementation.
        s[i] = ifelse(ai>0 && bi>0, t1, ifelse(bi<0, t2, zero(eltype(T))))*h
    end
    return nothing
end
julia> @benchmark superbee_refactor4!($s,$a,$b,$h)
BenchmarkTools.Trial: 10000 samples with 803 evaluations.
 Range (min … max):  154.908 ns … 518.042 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     156.822 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   158.300 ns ±  11.322 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▂▂▆▇█▆▅▃▁▂▁▁▁ ▂▂▃▃▁                                          ▂
  █████████████████████▆▆▇▆▆▆▄▆▄▅▅▅▇▆▅▅▆▇▆▇▆▇▇▆▇▇▆▇▆▆▇▆▇▆▄▆▆▆▁▆ █
  155 ns        Histogram: log(frequency) by time        177 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.