Indeed, I can confirm that running
export JULIA_LLVM_ARGS=" -unroll-count=4 "
julia --cpu-target=skylake -O3 -t 1 vecfma.jl > as.out
makes it worse.
However, removing the LLVM flag and running with `--cpu-target=tigerlake` instead gives me the expected result (tested on v1.8.5, v1.9.0, and v1.9.1):
...
; ││┌ @ essentials.jl:13 within `getindex`
vmovups (%rcx,%rsi,4), %ymm0
vmovups 32(%rcx,%rsi,4), %ymm1
vmovups 64(%rcx,%rsi,4), %ymm2
vmovups 96(%rcx,%rsi,4), %ymm3
vmovups (%rdx,%rsi,4), %ymm4
vmovups 32(%rdx,%rsi,4), %ymm5
vmovups 64(%rdx,%rsi,4), %ymm6
vmovups 96(%rdx,%rsi,4), %ymm7
; ││└
; ││┌ @ fastmath.jl:165 within `add_fast`
vfmadd213ps (%rax,%rsi,4), %ymm0, %ymm4 # ymm4 = (ymm0 * ymm4) + mem
vfmadd213ps 32(%rax,%rsi,4), %ymm1, %ymm5 # ymm5 = (ymm1 * ymm5) + mem
vfmadd213ps 64(%rax,%rsi,4), %ymm2, %ymm6 # ymm6 = (ymm2 * ymm6) + mem
vfmadd213ps 96(%rax,%rsi,4), %ymm3, %ymm7 # ymm7 = (ymm3 * ymm7) + mem
; ││└
; ││┌ @ array.jl:969 within `setindex!`
vmovups %ymm4, (%rax,%rsi,4)
vmovups %ymm5, 32(%rax,%rsi,4)
vmovups %ymm6, 64(%rax,%rsi,4)
vmovups %ymm7, 96(%rax,%rsi,4)
...