Recommended way to use something like `evalpoly` with `@turbo` from LoopVectorization.jl

What is the recommended way to evaluate polynomials inside loops annotated by @turbo from LoopVectorization.jl? Apparently, neither @evalpoly nor evalpoly work directly.

julia> using LoopVectorization

julia> function foo!(dst, src)
           @turbo for i in eachindex(dst, src)
               dst[i] = @evalpoly(src[i], 1, 2, 3)
           end
       end
ERROR: LoadError: MethodError: no method matching instruction!(::LoopVectorization.LoopSet, ::GlobalRef)
Closest candidates are:
  instruction!(::LoopVectorization.LoopSet, ::F) where F<:Function at ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1055
  instruction!(::LoopVectorization.LoopSet, ::Expr) at ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1038
  instruction!(::LoopVectorization.LoopSet, ::Symbol) at ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1054
Stacktrace:
  [1] add_compute!(ls::LoopVectorization.LoopSet, var::Symbol, ex::Expr, elementbytes::Int64, position::Int64, mpref::LoopVectorization.ArrayReferenceMetaPosition)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/parse/add_compute.jl:361
  [2] add_operation!(ls::LoopVectorization.LoopSet, LHS_sym::Symbol, RHS::Expr, LHS_ref::LoopVectorization.ArrayReferenceMetaPosition, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1158
  [3] prepare_rhs_for_storage!(ls::LoopVectorization.LoopSet, RHS::Expr, array::Symbol, rawindices::SubArray{Any, 1, Vector{Any}, Tuple{UnitRange{Int64}}, true}, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1185
  [4] add_assignment!(ls::LoopVectorization.LoopSet, LHS::Expr, RHS::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1247
  [5] push!(ls::LoopVectorization.LoopSet, ex::Expr, elementbytes::Int64, position::Int64, mpref::Nothing)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1271
  [6] push!(ls::LoopVectorization.LoopSet, ex::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1262
  [7] add_block!
    @ ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:797 [inlined]
  [8] add_loop!(ls::LoopVectorization.LoopSet, q::Expr, elementbytes::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1023
  [9] copyto!
    @ ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:15 [inlined]
 [10] LoopVectorization.LoopSet(q::Expr, mod::Symbol)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:66
 [11] LoopVectorization.LoopSet(q::Expr, m::Module)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:70
 [12] turbo_macro(::Module, ::LineNumberNode, ::Expr)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:121
 [13] var"@turbo"(__source__::LineNumberNode, __module__::Module, args::Vararg{Any})
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:223
in expression starting at REPL[1]:2

julia> function foo!(dst, src)
           @turbo for i in eachindex(dst, src)
               dst[i] = evalpoly(src[i], (1, 2, 3))
           end
       end
ERROR: LoadError: Expression not recognized.
(1, 2, 3)
Stacktrace:
  [1] add_operation!(ls::LoopVectorization.LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1130
  [2] add_parent!(vparents::Vector{LoopVectorization.Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopVectorization.LoopSet, var::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/parse/add_compute.jl:68
  [3] add_compute!(ls::LoopVectorization.LoopSet, var::Symbol, ex::Expr, elementbytes::Int64, position::Int64, mpref::LoopVectorization.ArrayReferenceMetaPosition)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/parse/add_compute.jl:394
  [4] add_operation!(ls::LoopVectorization.LoopSet, LHS_sym::Symbol, RHS::Expr, LHS_ref::LoopVectorization.ArrayReferenceMetaPosition, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1158
  [5] prepare_rhs_for_storage!(ls::LoopVectorization.LoopSet, RHS::Expr, array::Symbol, rawindices::SubArray{Any, 1, Vector{Any}, Tuple{UnitRange{Int64}}, true}, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1185
  [6] add_assignment!(ls::LoopVectorization.LoopSet, LHS::Expr, RHS::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1247
  [7] push!(ls::LoopVectorization.LoopSet, ex::Expr, elementbytes::Int64, position::Int64, mpref::Nothing)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1271
  [8] push!(ls::LoopVectorization.LoopSet, ex::Expr, elementbytes::Int64, position::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1262
  [9] add_block!
    @ ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:797 [inlined]
 [10] add_loop!(ls::LoopVectorization.LoopSet, q::Expr, elementbytes::Int64)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/modeling/graphs.jl:1023
 [11] copyto!
    @ ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:15 [inlined]
 [12] LoopVectorization.LoopSet(q::Expr, mod::Symbol)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:66
 [13] LoopVectorization.LoopSet(q::Expr, m::Module)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:70
 [14] turbo_macro(::Module, ::LineNumberNode, ::Expr)
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:121
 [15] var"@turbo"(__source__::LineNumberNode, __module__::Module, args::Vararg{Any})
    @ LoopVectorization ~/.julia/packages/LoopVectorization/x4G96/src/constructors.jl:223
in expression starting at REPL[2]:2

Do I need to write out Horner’s method by hand or is there any other clever trick I can use?

The error for evalpoly was:

ERROR: LoadError: Expression not recognized.
(1, 2, 3)

You can hoist the tuple construction.

function foo!(dst, src)
    coefs = (1,2,3)
    @turbo for i in eachindex(dst, src)
        dst[i] = evalpoly(src[i], coefs)
    end
end
1 Like

Thanks a lot! This seems to work fine but appears to be a bit less efficient than writing out Horner’s method manually.

julia> using LoopVectorization, BenchmarkTools

julia> function foo!(dst, src)
           coefs = (1,2,3)
           @turbo for i in eachindex(dst, src)
               dst[i] = evalpoly(src[i], coefs)
           end
       end
foo! (generic function with 1 method)

julia> function bar!(dst, src)
           @turbo for i in eachindex(dst, src)
               s = src[i]
               dst[i] = 1 + s*(2 + 3*s)
           end
       end
bar! (generic function with 1 method)

julia> src = rand(100); dst = zero(src);

julia> @benchmark foo!($dst, $src)
BenchmarkTools.Trial: 10000 samples with 996 evaluations.
 Range (min … max):  22.532 ns … 56.331 ns  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     22.949 ns              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   24.095 ns Β±  2.360 ns  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

   β–β–‚β–ƒβ–ˆ                                       ▁ β–†             ▁
  β–„β–ˆβ–ˆβ–ˆβ–ˆβ–β–†β–β–ˆβ–β–β–β–ƒβ–„β–„β–β–†β–‡β–„β–β–ƒβ–β–β–β–ƒβ–β–β–β–ƒβ–β–β–β–β–β–β–β–β–β–β–β–β–β–β–…β–ˆβ–‡β–ˆβ–…β–β–β–β–„β–β–β–„β–β–ƒβ–β–„ β–ˆ
  22.5 ns      Histogram: log(frequency) by time      28.6 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark bar!($dst, $src)
BenchmarkTools.Trial: 10000 samples with 997 evaluations.
 Range (min … max):  20.276 ns … 55.268 ns  β”Š GC (min … max): 0.00% … 0.00%
 Time  (median):     20.522 ns              β”Š GC (median):    0.00%
 Time  (mean Β± Οƒ):   20.802 ns Β±  2.012 ns  β”Š GC (mean Β± Οƒ):  0.00% Β± 0.00%

  β–‡β–ˆβ–                                                         ▁
  β–ˆβ–ˆβ–ˆβ–ˆβ–…β–…β–ƒβ–‡β–†β–„β–ƒβ–„β–β–β–„β–β–ƒβ–β–β–β–β–β–ƒβ–β–β–β–β–β–β–β–β–β–ƒβ–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–β–ƒβ–β–β–„β–‡β–‡ β–ˆ
  20.3 ns      Histogram: log(frequency) by time      33.4 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

I guess that’s because of the function call not recognized explicitly by LoopVectorization.jl, correct? Anyway, I don’t want to bother you with this and give you more time to work on the rewrite of LoopVectorization.jl to make this package even greater and more flexible :smiley:

2 Likes