Here is a runaway case under the profiler.
The top entry is:
_turbo_!(::Val{(false, 0, 0, 0, false, 4, 32, 15, 64, 32768, 262144, 12582912, 0x0000000000000005)}, ::Val{(:LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0001, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0002, 0x02), :LoopVectorization, :div_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000010002, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0003, 0x00), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0004, 0x03), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0005, 0x04), :numericconstant, Symbol(\"###reduction##zero###11###\"), LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0006, 0x00), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000001, 
0x00000000000000000000000000000000, 0x00000000000000000000000300040006, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0006, 0x00), :LoopVectorization, :reduced_add, LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000070005, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0005, 0x00), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000008, 0x00000000000000000000000000000000, LoopVectorization.memstore, 0x0007, 0x04))}, ::Val{(LoopVectorization.ArrayRefStruct{:G, Symbol(\"##vptr##_G\")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000201, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101), LoopVectorization.ArrayRefStruct{:norm, Symbol(\"##vptr##_norm\")}(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001), LoopVectorization.ArrayRefStruct{:data, Symbol(\"##vptr##_data\")}(0x00000000000000000000000000000001, 0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001), LoopVectorization.ArrayRefStruct{:mu, Symbol(\"##vptr##_mu\")}(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001))}, ::Val{(0, (), (), (), (), ((6, LoopVectorization.IntOrFloat),), ())}, ::Val{(:n, :k)}, ::Val{Tuple{Tuple{CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}, CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}}, Tuple{LayoutPointers.GroupedStridedPointers{NTuple{4, Ptr{Float64}}, (1, 1, 1, 1), (0, 0, 0, 0), ((1, 2), (1,), (1,), (1,)), ((1, 2), (3,), (4,), (5,)), Tuple{Static.StaticInt{8}, Int64, 
Static.StaticInt{8}, Static.StaticInt{8}, Static.StaticInt{8}}, NTuple{5, Static.StaticInt{0}}}}}}, ::Int64, ::Int64, ::Ptr{Float64}, ::Ptr{Float64}, ::Ptr{Float64}, ::Ptr{Float64}, ::Int64)
C:\Users\Win10\.julia\packages\LoopVectorization\tSQDi\src\reconstruct_loopset.jl
Total: 14675 14675 (flat, cum) 35.14%
713 14675 14675 @generated function _turbo_!(
714 . . ::Val{var"#UNROLL#"}, ::Val{var"#OPS#"}, ::Val{var"#ARF#"}, ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"}
715 . . ) where {var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#", var"#V#", var"#num#vargs#"}
716 . . # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
717 . . ls = _turbo_loopset(var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#".parameters, var"#V#".parameters, var"#UNROLL#")
718 . . pushfirst!(ls.preamble.args, :(var"#lv#tuple#args#" = reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#")))
This seems to be triggered from `step_M!`.
This leads me to the question of whether
evidences .= 0
@tturbo for n ∈ 1:N, k ∈ 1:K
evidences[k] += G[k, n]
end
is race-free? I'd propose using something like
evidences .= 0
@tturbo for k ∈ 1:K
for n ∈ 1:N
evidences[k] += G[k, n]
end
end
instead, assuming it is equivalent to (or better than) Threads.@threads for k ∈ 1:K
…