Can this be written even faster? (CPU)

I get

julia> include("/home/chriselrod/Documents/progwork/julia/scratch.jl")
WARNING: using BenchmarkTools.params in module Main conflicts with an existing identifier.
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  16.497 μs … 261.852 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     16.822 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   19.193 μs ±   6.929 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▃   ▄▂▃▂▁ ▁ ▁ ▃                           ▃                 ▁
  ██▅▅▆███████████▇▆▅▅▆▆██▆█▅▆▄▄▅▃▅▄▁▅▄▄▄▃▅▅▅█▇▄▄▅▅▁▄▃▃▄▃▄▄▇▅▆ █
  16.5 μs       Histogram: log(frequency) by time      45.3 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):   9.986 μs … 54.222 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     10.250 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   11.109 μs ±  2.885 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▆█▄                     ▄▃▁                                 ▁
  ███▆▅▃▄▃▁▁▃▄▁▃▆▆▄▄▄▅▅▅▅▅█████▆▇▇▆▆▇▆▆▆▅▅▅▅▄▃▅▅▃▄▅▅▅▄▄▄▆▆▆▇▆ █
  9.99 μs      Histogram: log(frequency) by time      21.8 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> versioninfo()
Julia Version 1.10.0
Commit 4745ef8816 (2024-02-06 13:44 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: 36 × Intel(R) Core(TM) i9-7980XE CPU @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, skylake-avx512)
Threads: 36 default, 0 interactive, 18 GC (on 36 virtual cores)
Script
using BenchmarkTools
using StaticArrays
using LoopVectorization


# READ: https://discourse.julialang.org/t/can-this-be-written-even-faster-cpu/109924/17
# Generate the needed constants
N     = 6000
Cb    = inv(Float64(100000))
invCb = inv(Cb)
xᵢⱼᶻ  = rand(Float64,N*4)
xᵢⱼ   = rand(SVector{3,Float64},N*4)
γ⁻¹   = Float64(1/7)
ρ₀    = Float64(1000)
g     = Float64(9.81)
drhopLp = zeros(Float64, length(xᵢⱼ))
drhopLn = zeros(Float64, length(xᵢⱼ))


@inline function fancy7th(x)
    # todo tune the magic constant
    # initial guess based on fast inverse sqrt trick but adjusted to compute x^(1/7)
    t = copysign(reinterpret(Float64, 0x36cd000000000000 + reinterpret(UInt64,abs(x))÷7), x)
    @fastmath for _ in 1:2
        # newton's method for t^3 - x/t^4 = 0
        t2 = t*t
        t3 = t2*t
        t4 = t2*t2
        xot4 = x/t4
        t = t - t*(t3 - xot4)/(4*t3 + 3*xot4)
    end
    t
end

@inline faux(ρ₀, P, Cb, γ⁻¹)  = ρ₀ * ( ^( 1 + (P * Cb), γ⁻¹)   - 1)
@inline faux_fancy(ρ₀, P, Cb) = ρ₀ * ( fancy7th( 1 + (P * Cb)) - 1)
function loopvectorization_approach3(invCb,xᵢⱼᶻ,ρ₀,γ⁻¹,g,drhopLp,drhopLn)
        @tturbo for iter in eachindex(xᵢⱼᶻ)
               # IF I COMMENT '* -xᵢⱼᶻ[iter]' NO ALLOCATIONS, WHY?
               Pᵢⱼᴴ  = ρ₀ * (-g) * -xᵢⱼᶻ[iter]
               Pⱼᵢᴴ  = -Pᵢⱼᴴ
               ρᵢⱼᴴ  = faux(ρ₀, Pᵢⱼᴴ, invCb, γ⁻¹)
               ρⱼᵢᴴ  = faux(ρ₀, Pⱼᵢᴴ, invCb, γ⁻¹)
               drhopLp[iter] = ρᵢⱼᴴ
               drhopLn[iter] = ρⱼᵢᴴ
        end
    end

function fancy_loopvectorization_approach3(invCb,xᵢⱼᶻ,ρ₀,g,drhopLp,drhopLn)
    @tturbo for iter in eachindex(xᵢⱼᶻ)
           # IF I COMMENT '* -xᵢⱼᶻ[iter]' NO ALLOCATIONS, WHY?
           Pᵢⱼᴴ  = ρ₀ * (-g) * -xᵢⱼᶻ[iter]
           Pⱼᵢᴴ  = -Pᵢⱼᴴ
           ρᵢⱼᴴ  = faux_fancy(ρ₀, Pᵢⱼᴴ, invCb)
           ρⱼᵢᴴ  = faux_fancy(ρ₀, Pⱼᵢᴴ, invCb)
           drhopLp[iter] = ρᵢⱼᴴ
           drhopLn[iter] = ρⱼᵢᴴ
    end
end

l3 = @benchmark loopvectorization_approach3($invCb,$xᵢⱼᶻ,$ρ₀, $γ⁻¹, $g,$drhopLp,$drhopLn)
f3 = @benchmark fancy_loopvectorization_approach3($invCb,$xᵢⱼᶻ,$ρ₀, $g,$drhopLp,$drhopLn)

display(l3);
display(f3);

On another server

BenchmarkTools.Trial: 318 samples with 1 evaluation.
 Range (min … max):  1.083 ms … 226.018 ms  ┊ GC (min … max):  0.00% … 90.42%
 Time  (median):     1.146 ms               ┊ GC (median):     0.00%
 Time  (mean ± σ):   3.139 ms ±  20.050 ms  ┊ GC (mean ± σ):  59.79% ±  9.30%

  █▇▂                                                          
  ███▇▇▇▄▆▁▁▁▄▁▁▄▁▄▁▁▄▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▆
  1.08 ms      Histogram: log(frequency) by time      4.62 ms <

 Memory estimate: 15.38 MiB, allocs estimate: 432000.

julia> display(f3);
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):   7.530 μs …  3.756 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     11.570 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   11.908 μs ± 37.459 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                                              ▃▂  █▇           
  ▂▁▂▂▂▁▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▄▃▂▃▃▄██▇███▇▅▃▃▃▃▃▂▂ ▃
  7.53 μs         Histogram: frequency by time        12.5 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> versioninfo()
Julia Version 1.10.0
Commit 3120989f39b (2023-12-25 18:01 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 64 × AMD EPYC 7513 32-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, znver3)
  Threads: 95 on 64 virtual cores

the allocations don’t show up with AVX512, which is why the 7980xe didn’t have that problem.

2 Likes