Array Contraction, LoopVectorization & AD

Hi @mcabbott ,
Thank you for your answer.
So, a more complicated contraction is, for instance (I am trying also the package you suggested)

function tullio_conv(W,v)
    return @tullio C[i,k] := W[i,j,k,l] * v[j,l]
end

function tensor_conv(W,v)
    return @tensor C[i,k] := W[i,j,k,l] * v[j,l]
end

The benchmark reads

W = rand(2, 2, 37, 1400)
x = rand(2, 1400)
@benchmark sum(tullio_conv(W, x))

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  36.650 μs … 147.037 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     37.334 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   38.074 μs ±   2.253 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

    █▅                                                          
  ▁▃███▄▃▂▂▁▂▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
  36.6 μs         Histogram: frequency by time         45.3 μs <

 Memory estimate: 688 bytes, allocs estimate: 2.

@benchmark sum(tensor_conv(W, x))

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  141.516 μs …   3.715 ms  ┊ GC (min … max):  0.00% … 92.18%
 Time  (median):     203.635 μs               ┊ GC (median):     0.00%
 Time  (mean ± σ):   297.734 μs ± 433.301 μs  ┊ GC (mean ± σ):  26.07% ± 15.99%

  ▆█▆▅▃▂                                                        ▂
  ███████▇▆▅▅▃▄▄▁▁▁▁▁▁▁▅▁▁▁▄▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▄▆▇▇▇██▇█▇▇ █
  142 μs        Histogram: log(frequency) by time       2.61 ms <

 Memory estimate: 1.59 MiB, allocs estimate: 137.

Regarding AD, only Tullio looks to be working

@benchmark gradient(W -> sum(tullio_conv(W, x)), W)

BenchmarkTools.Trial: 8735 samples with 1 evaluation.
 Range (min … max):  406.598 μs …   4.155 ms  ┊ GC (min … max):  0.00% … 86.13%
 Time  (median):     467.586 μs               ┊ GC (median):     0.00%
 Time  (mean ± σ):   570.568 μs ± 404.231 μs  ┊ GC (mean ± σ):  12.25% ± 13.93%

  ▃█▆▄▂   ▁▃▃▁                                                  ▁
  ██████▆▅█████▆▄▅▃▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▇▇▇████ █
  407 μs        Histogram: log(frequency) by time       2.65 ms <

 Memory estimate: 1.61 MiB, allocs estimate: 59.
Error stacktrace for TensorOperations

MethodError: no method matching StridedViews.StridedView(::FillArrays.Fill{Float64, 2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})

Closest candidates are:
StridedViews.StridedView(::PermutedDimsArray{T, N, P}) where {T, N, P}
@ StridedViews ~/.julia/packages/StridedViews/dcnHM/src/stridedview.jl:51
StridedViews.StridedView(::Base.ReshapedArray)
@ StridedViews ~/.julia/packages/StridedViews/dcnHM/src/stridedview.jl:50
StridedViews.StridedView(::SubArray)
@ StridedViews ~/.julia/packages/StridedViews/dcnHM/src/stridedview.jl:49

Stacktrace:
[1] tensorcontract!(C::Array{Float64, 4}, pC::Tuple{NTuple{4, Int64}, Tuple{}}, A::FillArrays.Fill{Float64, 2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, pA::Tuple{Tuple{Int64, Int64}, Tuple{}}, conjA::Symbol, B::Matrix{Float64}, pB::Tuple{Tuple{}, Tuple{Int64, Int64}}, conjB::Symbol, α::VectorInterface.One, β::VectorInterface.Zero, #unused#::TensorOperations.Backend{:StridedBLAS})
@ TensorOperations ~/.julia/packages/TensorOperations/7VyQe/src/implementation/abstractarray.jl:63
[2] tensorcontract!(C::Array{Float64, 4}, pC::Tuple{NTuple{4, Int64}, Tuple{}}, A::FillArrays.Fill{Float64, 2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, pA::Tuple{Tuple{Int64, Int64}, Tuple{}}, conjA::Symbol, B::Matrix{Float64}, pB::Tuple{Tuple{}, Tuple{Int64, Int64}}, conjB::Symbol, α::VectorInterface.One, β::VectorInterface.Zero)
@ TensorOperations ~/.julia/packages/TensorOperations/7VyQe/src/implementation/abstractarray.jl:35
[3] (::TensorOperationsChainRulesCoreExt.var"#58#65"{Tuple{Tuple{Int64, Int64}, Tuple{}}, FillArrays.Fill{Float64, 2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, VectorInterface.One, Tuple{}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, NTuple{4, Base.OneTo{Int64}}}}}})()
@ TensorOperationsChainRulesCoreExt ~/.julia/packages/TensorOperations/7VyQe/ext/TensorOperationsChainRulesCoreExt.jl:99
[4] unthunk
@ ~/.julia/packages/ChainRulesCore/7MWx2/src/tangent_types/thunks.jl:204 [inlined]
[5] wrap_chainrules_output
@ ~/.julia/packages/Zygote/YYT6v/src/compiler/chainrules.jl:110 [inlined]
[6] map (repeats 4 times)
@ ./tuple.jl:276 [inlined]
[7] wrap_chainrules_output
@ ~/.julia/packages/Zygote/YYT6v/src/compiler/chainrules.jl:111 [inlined]
[8] ZBack
@ ~/.julia/packages/Zygote/YYT6v/src/compiler/chainrules.jl:211 [inlined]
[9] Pullback
@ ./In[35]:25 [inlined]
[10] (::Zygote.Pullback{Tuple{typeof(tensor_conv), Array{Float64, 4}, Matrix{Float64}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Array{Float64, 4}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Array{Float64, 4}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#pullback#63"{Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, VectorInterface.One, VectorInterface.Zero, Tuple{}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, NTuple{4, Base.OneTo{Int64}}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#tensoralloc_contract_pullback#41"{Tuple{DataType, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, Bool}}}, Zygote.Pullback{Tuple{typeof(TensorOperations.promote_contract), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.Pullback{Tuple{typeof(Base.promote_op), typeof(TensorOperations.tensorop), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.ZBack{ChainRules.var"#apply_type_pullback#42"{Tuple{DataType, DataType}}}}}, Zygote.Pullback{Tuple{typeof(Core.Compiler.return_type), typeof(TensorOperations.tensorop), Type{Tuple{Float64, Float64}}}, Tuple{typeof(Core.Compiler.return_type)}}}}}}}}, Zygote.Pullback{Tuple{typeof(scalartype), Matrix{Float64}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Matrix{Float64}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}}})(Δ::FillArrays.Fill{Float64, 2, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}})
@ Zygote ~/.julia/packages/Zygote/YYT6v/src/compiler/interface2.jl:0
[11] Pullback
@ ./In[38]:1 [inlined]
[12] (::Zygote.Pullback{Tuple{var"#47#48", Array{Float64, 4}}, Tuple{Zygote.var"#2995#back#766"{Zygote.var"#760#764"{Matrix{Float64}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float64}}}, Zygote.Pullback{Tuple{typeof(tensor_conv), Array{Float64, 4}, Matrix{Float64}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Array{Float64, 4}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Array{Float64, 4}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#pullback#63"{Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, VectorInterface.One, VectorInterface.Zero, Tuple{}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, NTuple{4, Base.OneTo{Int64}}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#tensoralloc_contract_pullback#41"{Tuple{DataType, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, Bool}}}, Zygote.Pullback{Tuple{typeof(TensorOperations.promote_contract), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.Pullback{Tuple{typeof(Base.promote_op), typeof(TensorOperations.tensorop), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.ZBack{ChainRules.var"#apply_type_pullback#42"{Tuple{DataType, DataType}}}}}, Zygote.Pullback{Tuple{typeof(Core.Compiler.return_type), typeof(TensorOperations.tensorop), Type{Tuple{Float64, Float64}}}, Tuple{typeof(Core.Compiler.return_type)}}}}}}}}, Zygote.Pullback{Tuple{typeof(scalartype), Matrix{Float64}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Matrix{Float64}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}}}}})(Δ::Float64)
@ Zygote ~/.julia/packages/Zygote/YYT6v/src/compiler/interface2.jl:0
[13] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{var"#47#48", Array{Float64, 4}}, Tuple{Zygote.var"#2995#back#766"{Zygote.var"#760#764"{Matrix{Float64}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float64}}}, Zygote.Pullback{Tuple{typeof(tensor_conv), Array{Float64, 4}, Matrix{Float64}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Array{Float64, 4}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Array{Float64, 4}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#pullback#63"{Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, VectorInterface.One, VectorInterface.Zero, Tuple{}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{Number, NamedTuple{(), Tuple{}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}}}}, ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ProjectTo{Float64, NamedTuple{(), Tuple{}}}, NTuple{4, Base.OneTo{Int64}}}}}}}, Zygote.ZBack{TensorOperationsChainRulesCoreExt.var"#tensoralloc_contract_pullback#41"{Tuple{DataType, Tuple{Tuple{Int64, Int64}, Tuple{}}, Array{Float64, 4}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Symbol, Matrix{Float64}, Tuple{Tuple{Int64, Int64}, Tuple{}}, Symbol, Bool}}}, Zygote.Pullback{Tuple{typeof(TensorOperations.promote_contract), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.Pullback{Tuple{typeof(Base.promote_op), typeof(TensorOperations.tensorop), Type{Float64}, Type{Float64}}, Tuple{Zygote.var"#2017#back#204"{typeof(identity)}, Zygote.var"#2173#back#293"{Zygote.var"#291#292"{Tuple{Tuple{Nothing}, Tuple{Nothing, Nothing}}, Zygote.ZBack{ChainRules.var"#apply_type_pullback#42"{Tuple{DataType, DataType}}}}}, Zygote.Pullback{Tuple{typeof(Core.Compiler.return_type), typeof(TensorOperations.tensorop), Type{Tuple{Float64, Float64}}}, Tuple{typeof(Core.Compiler.return_type)}}}}}}}}, Zygote.Pullback{Tuple{typeof(scalartype), Matrix{Float64}}, Tuple{Zygote.ZBack{ChainRules.var"#typeof_pullback#45"}, Zygote.Pullback{Tuple{typeof(scalartype), Type{Matrix{Float64}}}, Tuple{Zygote.Pullback{Tuple{typeof(scalartype), Type{Float64}}, Tuple{}}}}}}}}}}})(Δ::Float64)
@ Zygote ~/.julia/packages/Zygote/YYT6v/src/compiler/interface.jl:45
[14] gradient(f::Function, args::Array{Float64, 4})
@ Zygote ~/.julia/packages/Zygote/YYT6v/src/compiler/interface.jl:97
[15] var"##core#1252"()
@ Main ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:489
[16] var"##sample#1253"(::Tuple{}, __params::BenchmarkTools.Parameters)
@ Main ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:495
[17] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Pairs{Symbol, Integer, NTuple{4, Symbol}, NamedTuple{(:samples, :evals, :gctrial, :gcsample), Tuple{Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:99
[18] #invokelatest#2
@ ./essentials.jl:821 [inlined]
[19] invokelatest
@ ./essentials.jl:816 [inlined]
[20] #run_result#45
@ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:34 [inlined]
[21] run_result
@ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:34 [inlined]
[22] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, kwargs::Base.Pairs{Symbol, Integer, NTuple{5, Symbol}, NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample), Tuple{Bool, Int64, Int64, Bool, Bool}}})
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:117
[23] run (repeats 2 times)
@ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:117 [inlined]
[24] #warmup#54
@ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:169 [inlined]
[25] warmup(item::BenchmarkTools.Benchmark)
@ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:168
[26] top-level scope
@ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:393

From you message, I understood I am not doing anything wrong. Is this right, or am I missing something?