@inbounds with dot notation?

performance

#1

I’m wondering how to use @inbounds when vectorizing a function with dot notation or the @. macro. I found Base.@propagate_inbounds but am not sure I’m using it correctly. Consider the following:

using BenchmarkTools

# Scalar predicate: `true` when `x` lies within `δ` of the lower bound `l`
# (or below it) or within `δ` of the upper bound `u` (or above it).
# Intended to be broadcast over arrays as `active_dot.(x, l, u, δ)`.
#
# NOTE(review): the body performs no indexing, so `@propagate_inbounds` has no
# bounds check to elide here; its only effect on this function is to force
# inlining into the caller.
Base.@propagate_inbounds function active_dot(x, l, u, δ)
  # Keep the short-circuit: the upper-bound test only runs if the lower one fails.
  x <= l + δ && return true
  return x >= u - δ
end

"""
    active!(A, x, l, u, δ)

Overwrite `A` so that `A[i]` is `true` when `x[i] ≤ l[i] + δ[i]` or
`x[i] ≥ u[i] - δ[i]`, and `false` otherwise. Return `A`.

Throws a `DimensionMismatch` if the five arrays do not all have the same size.
"""
function active!(A, x, l, u, δ)
  # The loop below runs under `@inbounds` but indexes all five arrays with the
  # indices of `x` alone, so a size mismatch would silently read or write out of
  # bounds. Validate once, outside the hot loop.
  # NOTE(review): assumes conventionally indexed arrays, where equal sizes imply
  # equal index ranges.
  if !(size(A) == size(x) == size(l) == size(u) == size(δ))
    throw(DimensionMismatch("A, x, l, u and δ must all have the same size"))
  end
  @inbounds for i in eachindex(x)
    A[i] = (x[i] ≤ l[i] + δ[i]) || (x[i] ≥ u[i] - δ[i])
  end
  return A
end

"""
    foo(n)

Benchmark the explicit `@inbounds` loop `active!` against the broadcast form
`A .= active_dot.(x, l, u, δ)` on vectors of length `n`, print both
BenchmarkTools trials to `stdout`, and assert that the two results agree.

Written for Julia ≥ 1.0: uses `.-` for array-minus-scalar, the `undef` array
constructor and lowercase `stdout`.
"""
function foo(n)
  l = zeros(n)
  u = ones(n)
  x = rand(n) .- 0.5  # array-minus-scalar has to be an explicit broadcast
  rtol = atol = 1.0e-8
  # is it possible to write the following with dot notation??
  δ = [-Inf < l[i] < u[i] < Inf ? min(rtol * (u[i] - l[i]), atol) : atol for i in eachindex(l, u)]

  # Uninitialized outputs; each benchmark overwrites every element.
  A1 = Vector{Bool}(undef, n)
  A2 = Vector{Bool}(undef, n)

  t1 = @benchmark active!($A1, $x, $l, $u, $δ)
  show(stdout, MIME"text/plain"(), t1); println()

  t2 = @benchmark $A2 .= active_dot.($x, $l, $u, $δ)
  show(stdout, MIME"text/plain"(), t2); println()

  # Sanity check: both formulations must produce identical masks.
  @assert A1 == A2
end

# Run the comparison on vectors of length 10000.
foo(10000)

The explicit @inbounds-instrumented loop is still about 10x faster:

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     5.528 μs (0.00% GC)
  median time:      6.125 μs (0.00% GC)
  mean time:        6.545 μs (0.00% GC)
  maximum time:     23.742 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     6
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     45.809 μs (0.00% GC)
  median time:      49.291 μs (0.00% GC)
  mean time:        52.758 μs (0.00% GC)
  maximum time:     229.375 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

Can active_dot be improved somehow to close the gap? Thanks!