Simple loop won't vectorize


#1

I’m porting over some of my Rust code to see if I can make the switch to Julia. My project is a numerical simulation benefits significantly from vectorization, so I’ve been testing a few simple functions in Julia to see how it compares to Rust.

Rust is able to use 256-bit AVX instructions for the following function, so it runs in 140ms, but Julia is only using scalar instructions, so it takes 1,518ms. What can I do to make the Julia code use AVX too? (I tried starting julia with --math-mode=fast, but that already seems to be the default.)

julia> function f()
           s = 0.0
           for i = 1e-9 * 1:10^9
               s += i * 1.23
           end
           s
       end

Rust:

use fastfloat::*;

fn f() -> F64 {
    let mut s = Fast(0.0);
    for i in 1..1_000_000_001 {
        s += Fast(i as f64) * 1.23;
    }
    s
}
julia> @code_native f()
        .text
; Function f {
; Location: REPL[66]:3
; Function literal_pow; {
; Location: none
; Function macro expansion; {
; Location: none
; Function ^; {
; Location: REPL[66]:2
        subq    $56, %rsp
        movabsq $139783190159232, %rax  # imm = 0x7F21CF651F80
        movl    $10, %edi
        movl    $9, %esi
        callq   *%rax
;}}}
; Function Colon; {
; Location: range.jl:3
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
        vcvtsi2sdq      %rax, %xmm0, %xmm2
;}}}}
; Function Colon; {
; Location: range.jl:14
        movabsq $Colon, %rax
        movabsq $139782882036504, %rcx  # imm = 0x7F21BD078B18
        vmovsd  (%rcx), %xmm0           # xmm0 = mem[0],zero
        movabsq $139782882036512, %rcx  # imm = 0x7F21BD078B20
        vmovsd  (%rcx), %xmm1           # xmm1 = mem[0],zero
        leaq    8(%rsp), %rdi
        callq   *%rax
;}}
; Function iterate; {
; Location: range.jl:567
; Function iterate; {
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
        movq    40(%rsp), %rax
        testq   %rax, %rax
;}
        jle     L315
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:448
; Function -; {
; Location: int.jl:52
        movq    48(%rsp), %rcx
        movl    $1, %edx
        subq    %rcx, %rdx
;}
; Location: twiceprecision.jl:449
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
        vcvtsi2sdq      %rdx, %xmm3, %xmm0
;}}}}
; Function *; {
; Location: float.jl:399
        vmovsd  24(%rsp), %xmm8         # xmm8 = mem[0],zero
        vmulsd  %xmm0, %xmm8, %xmm6
        vmovsd  32(%rsp), %xmm9         # xmm9 = mem[0],zero
        movabsq $139782882036528, %rsi  # imm = 0x7F21BD078B30
;}}}
; Function unsafe_getindex; {
; Location: float.jl:519
        vmovapd (%rsi), %xmm1
        vandpd  %xmm1, %xmm6, %xmm2
        vmovsd  8(%rsp), %xmm4          # xmm4 = mem[0],zero
;}
; Function unsafe_getindex; {
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
        vmovsd  16(%rsp), %xmm10        # xmm10 = mem[0],zero
;}
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:84
; Function abs; {
; Location: float.jl:519
        vandpd  %xmm1, %xmm4, %xmm5
;}}
; Function add12; {
; Location: float.jl:452
        vucomisd        %xmm5, %xmm2
;}}}}}
; Function f {
; Location: twiceprecision.jl:84
        vcmpnltsd       %xmm2, %xmm5, %xmm1
        vblendvpd       %xmm1, %xmm4, %xmm6, %xmm1
        jbe     L179
        vmovapd %xmm4, %xmm6
;}
; Function f {
; Location: REPL[66]:3
; Function iterate; {
; Location: range.jl:567
; Function iterate; {
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
L179:
        vfmadd213sd     %xmm10, %xmm9, %xmm0
;}}}
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
        vaddsd  %xmm6, %xmm0, %xmm0
        vaddsd  %xmm1, %xmm0, %xmm0
        movabsq $139782882036520, %rdi  # imm = 0x7F21BD078B28
;}}}}
; Location: REPL[66]:4
; Function *; {
; Location: float.jl:399
        vmulsd  (%rdi), %xmm0, %xmm0
;}
; Function iterate; {
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
        cmpq    $1, %rax
;}
        je      L310
        negq    %rcx
        movl    $2, %edx
        vmovapd (%rsi), %xmm11
        vmovsd  (%rdi), %xmm7           # xmm7 = mem[0],zero
        nopw    %cs:(%rax,%rax)
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:449
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
L240:
        leaq    (%rcx,%rdx), %rsi
        vcvtsi2sdq      %rsi, %xmm12, %xmm1
;}}}}
; Function *; {
; Location: float.jl:399
        vmulsd  %xmm1, %xmm8, %xmm2
;}}}
; Function unsafe_getindex; {
; Location: float.jl:519
        vandpd  %xmm11, %xmm2, %xmm6
;}
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: float.jl:452
        vucomisd        %xmm5, %xmm6
;}}}}
; Function f {
; Location: twiceprecision.jl:84
        vmovapd %xmm4, %xmm3
        jbe     L272
        vmovapd %xmm2, %xmm3
L272:
        vcmpnltsd       %xmm6, %xmm5, %xmm6
        vblendvpd       %xmm6, %xmm2, %xmm4, %xmm2
;}
; Function f {
; Location: REPL[66]:4
; Function iterate; {
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
        vfmadd213sd     %xmm10, %xmm9, %xmm1
;}}}
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
        vaddsd  %xmm2, %xmm1, %xmm1
        vaddsd  %xmm3, %xmm1, %xmm1
;}}}
; Function +; {
; Location: float.jl:395
        vfmadd231sd     %xmm7, %xmm1, %xmm0
;}
; Function iterate; {
; Location: range.jl:568
; Function +; {
; Location: int.jl:53
        addq    $1, %rdx
;}
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
        cmpq    %rdx, %rax
;}
        jge     L240
;}
; Location: REPL[66]:6
L310:
        addq    $56, %rsp
        retq
L315:
        vxorps  %xmm0, %xmm0, %xmm0
; Location: REPL[66]:6
        addq    $56, %rsp
        retq
        nopw    %cs:(%rax,%rax)
;}

Rust:

    4280:	48 83 ec 38          	sub    $0x38,%rsp
    4284:	c5 f9 6f 05 74 2d 02 	vmovdqa 0x22d74(%rip),%xmm0        # 27000 <_fini+0xe68>
    428b:	00 
    428c:	c4 41 29 57 d2       	vxorpd %xmm10,%xmm10,%xmm10
    4291:	b8 00 ca 9a 3b       	mov    $0x3b9aca00,%eax
    4296:	c4 e2 79 18 0d a1 2e 	vbroadcastss 0x22ea1(%rip),%xmm1        # 27140 <_fini+0xfa8>
    429d:	02 00 
    429f:	c5 f8 29 4c 24 20    	vmovaps %xmm1,0x20(%rsp)
    42a5:	c4 e2 79 18 0d 96 2e 	vbroadcastss 0x22e96(%rip),%xmm1        # 27144 <_fini+0xfac>
    42ac:	02 00 
    42ae:	c5 f8 29 4c 24 10    	vmovaps %xmm1,0x10(%rsp)
    42b4:	c4 e2 79 18 0d 8b 2e 	vbroadcastss 0x22e8b(%rip),%xmm1        # 27148 <_fini+0xfb0>
    42bb:	02 00 
    42bd:	c5 f8 29 0c 24       	vmovaps %xmm1,(%rsp)
    42c2:	c4 e2 7d 19 25 d5 2e 	vbroadcastsd 0x22ed5(%rip),%ymm4        # 271a0 <_fini+0x1008>
    42c9:	02 00 
    42cb:	c4 62 79 58 05 78 2e 	vpbroadcastd 0x22e78(%rip),%xmm8        # 2714c <_fini+0xfb4>
    42d2:	02 00 
    42d4:	c4 62 79 58 0d 73 2e 	vpbroadcastd 0x22e73(%rip),%xmm9        # 27150 <_fini+0xfb8>
    42db:	02 00 
    42dd:	c4 62 79 58 35 6e 2e 	vpbroadcastd 0x22e6e(%rip),%xmm14        # 27154 <_fini+0xfbc>
    42e4:	02 00 
    42e6:	c4 e2 79 58 0d 69 2e 	vpbroadcastd 0x22e69(%rip),%xmm1        # 27158 <_fini+0xfc0>
    42ed:	02 00 
    42ef:	c4 e2 79 58 15 64 2e 	vpbroadcastd 0x22e64(%rip),%xmm2        # 2715c <_fini+0xfc4>
    42f6:	02 00 
    42f8:	c4 41 21 57 db       	vxorpd %xmm11,%xmm11,%xmm11
    42fd:	c4 41 19 57 e4       	vxorpd %xmm12,%xmm12,%xmm12
    4302:	c4 41 11 57 ed       	vxorpd %xmm13,%xmm13,%xmm13
    4307:	66 0f 1f 84 00 00 00 	nopw   0x0(%rax,%rax,1)
    430e:	00 00 
    4310:	c5 f9 fe 5c 24 20    	vpaddd 0x20(%rsp),%xmm0,%xmm3
    4316:	c5 7e e6 f8          	vcvtdq2pd %xmm0,%ymm15
    431a:	c5 05 59 fc          	vmulpd %ymm4,%ymm15,%ymm15
    431e:	c4 41 05 58 d2       	vaddpd %ymm10,%ymm15,%ymm10
    4323:	c5 f9 fe 6c 24 10    	vpaddd 0x10(%rsp),%xmm0,%xmm5
    4329:	c5 fe e6 db          	vcvtdq2pd %xmm3,%ymm3
    432d:	c5 fe e6 ed          	vcvtdq2pd %xmm5,%ymm5
    4331:	c5 e5 59 dc          	vmulpd %ymm4,%ymm3,%ymm3
    4335:	c4 c1 65 58 db       	vaddpd %ymm11,%ymm3,%ymm3
    433a:	c5 d5 59 ec          	vmulpd %ymm4,%ymm5,%ymm5
    433e:	c4 c1 55 58 ec       	vaddpd %ymm12,%ymm5,%ymm5
    4343:	c5 f9 fe 34 24       	vpaddd (%rsp),%xmm0,%xmm6
    4348:	c5 fe e6 f6          	vcvtdq2pd %xmm6,%ymm6
    434c:	c5 cd 59 f4          	vmulpd %ymm4,%ymm6,%ymm6
    4350:	c4 c1 4d 58 f5       	vaddpd %ymm13,%ymm6,%ymm6
    4355:	c4 c1 79 fe f8       	vpaddd %xmm8,%xmm0,%xmm7
    435a:	c5 fe e6 ff          	vcvtdq2pd %xmm7,%ymm7
    435e:	c5 c5 59 fc          	vmulpd %ymm4,%ymm7,%ymm7
    4362:	c4 41 45 58 d2       	vaddpd %ymm10,%ymm7,%ymm10
    4367:	c4 c1 79 fe f9       	vpaddd %xmm9,%xmm0,%xmm7
    436c:	c5 fe e6 ff          	vcvtdq2pd %xmm7,%ymm7
    4370:	c5 c5 59 fc          	vmulpd %ymm4,%ymm7,%ymm7
    4374:	c5 45 58 db          	vaddpd %ymm3,%ymm7,%ymm11
    4378:	c4 c1 79 fe de       	vpaddd %xmm14,%xmm0,%xmm3
    437d:	c5 fe e6 db          	vcvtdq2pd %xmm3,%ymm3
    4381:	c5 e5 59 dc          	vmulpd %ymm4,%ymm3,%ymm3
    4385:	c5 65 58 e5          	vaddpd %ymm5,%ymm3,%ymm12
    4389:	c5 f9 fe d9          	vpaddd %xmm1,%xmm0,%xmm3
    438d:	c5 fe e6 db          	vcvtdq2pd %xmm3,%ymm3
    4391:	c5 e5 59 dc          	vmulpd %ymm4,%ymm3,%ymm3
    4395:	c5 65 58 ee          	vaddpd %ymm6,%ymm3,%ymm13
    4399:	c5 f9 fe c2          	vpaddd %xmm2,%xmm0,%xmm0
    439d:	83 c0 e0             	add    $0xffffffe0,%eax
    43a0:	0f 85 6a ff ff ff    	jne    4310 <_ZN4play1f17h63840982574dd7feE+0x90>
    43a6:	c4 c1 25 58 c2       	vaddpd %ymm10,%ymm11,%ymm0
    43ab:	c5 9d 58 c0          	vaddpd %ymm0,%ymm12,%ymm0
    43af:	c5 95 58 c0          	vaddpd %ymm0,%ymm13,%ymm0
    43b3:	c4 e3 7d 19 c1 01    	vextractf128 $0x1,%ymm0,%xmm1
    43b9:	c5 fd 58 c1          	vaddpd %ymm1,%ymm0,%ymm0
    43bd:	c4 e3 79 05 c8 01    	vpermilpd $0x1,%xmm0,%xmm1
    43c3:	c5 f9 58 c1          	vaddpd %xmm1,%xmm0,%xmm0
    43c7:	48 83 c4 38          	add    $0x38,%rsp
    43cb:	c5 f8 77             	vzeroupper 
    43ce:	c3                   	retq   
    43cf:	90                   	nop


#2

Is this really what you want?

julia> 1e-9 * 1:10^9
1.0e-9:1.0:1.0e9

If you remove the 1e-9, your loop iterates over integers instead and goes much faster.


#3

I strongly advise against using —math-mode=fast as it’s a fairly inherently broken feature. This loop can’t be vectorized without permission since floating point addition isn’t associative. You can put the @simd macro on the loop to give the compiler permission to reassociate operations as necessary to vectorize.


#4

Ugh sorry about that… I was sloppy when writing the post and forgot to remove the 1e-9 in the Julia code. It’s not the main issue though, which I have tracked down by looking at the LLVM-IR.

It seems that Rust uses i32 for the iterator, but Julia uses i64. Forcing i32 makes the Julia assembly nearly identical (and just as fast) as the Rust. For some reason LLVM finds it much easier to convert i32 -> f64 than i64 -> f64.

With for i in 1:10^9:

julia> @time f()
0.714863 seconds (5 allocations: 176 bytes)
6.150000006112102e17

With for i in Int32(1):Int32(10^9):

julia> @time f()
0.126425 seconds (5 allocations: 176 bytes)
6.150000006112102e17

#5

You’re probably measuring compilation time the first time around - use @btimeor @benchmark from BenchmarkTools for accurate benchmarking results.


#6

Thanks, but what I posted was for the 2nd runs and the results are stable.


#7

I am seeing the same thing:

julia> @btime (s = 0.0; @simd for i in 1:10^9 s += i * 1.23 end; s)
  713.254 ms (0 allocations: 0 bytes)
6.150000006112102e17

julia> @btime (s = 0.0; @simd for i in Int32(1):Int32(10^9) s += i * 1.23 end; s)
  130.840 ms (0 allocations: 0 bytes)
6.150000006112102e17

Both cases seem to SIMD, based on the native code, but only the Int32 case generates sane code. I’d love to understand what’s going on here.


Extra token, on same line as for loop start, is valid?
#8

It seems like a LLVM issue. Rust will also generate the same slow code if I force it to use i64 like 1..(2e9 as i64). That’s with Rust nightly, which is using a snapshot of LLVM 8.

I’m a bit beyond my depth here, but maybe a LLVM bug report would be appropriate?


#9

Also note that * binds more tightly than : so 1e-9 * 1:10^9 is misleadingly spaced and means (1e-9 * 1):10^9 not 1e-9 * (1:10^9).


#10

Interestingly, the problem seems architecture specific.

julia> using BenchmarkTools

julia> function fi64()
           s = 0.0
           @simd for i = 1:10^9
               s += i * 1.23
           end
           s
       end
fi64 (generic function with 1 method)

julia> function fi32()
           s = 0.0
           @simd for i = Int32(1):Int32(10^9)
               s += i * 1.23
           end
           s
       end
fi32 (generic function with 1 method)

yields:

julia> @btime fi32()
  64.811 ms (0 allocations: 0 bytes)
6.150000006117115e17

julia> @btime fi64()
  47.321 ms (0 allocations: 0 bytes)
6.150000006117115e17

julia> versioninfo()
Julia Version 1.2.0-DEV.219
Commit 77739c8780 (2019-01-26 07:58 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, skylake)

The 64 bit integer version is about 35% faster on my computer. The assembly of both look sane. 64 bit int loop body:

L208:
	vmovapd	%zmm6, %zmm7
	vmovapd	%zmm5, %zmm8
	vmovapd	%zmm4, %zmm9
	vmovapd	%zmm2, %zmm10
; │││││ @ int.jl:53 within `+'
	vpsubq	%zmm1, %zmm0, %zmm2
	vpaddq	(%r9){1to8}, %zmm0, %zmm4
	vpaddq	(%r10){1to8}, %zmm0, %zmm5
	vpaddq	(%rdx){1to8}, %zmm0, %zmm6
; │└└└└
; │┌ @ float.jl:60 within `macro expansion'
	vcvtqq2pd	%zmm2, %zmm2
	vcvtqq2pd	%zmm4, %zmm4
	vcvtqq2pd	%zmm5, %zmm5
	vcvtqq2pd	%zmm6, %zmm6
; │└
; │┌ @ simdloop.jl:73 within `macro expansion' @ REPL[1]:4
; ││┌ @ float.jl:395 within `+'
	vfmadd213pd	%zmm10, %zmm3, %zmm2
	vfmadd213pd	%zmm9, %zmm3, %zmm4
	vfmadd213pd	%zmm8, %zmm3, %zmm5
	vfmadd213pd	%zmm7, %zmm3, %zmm6
; ││└
; ││ @ simdloop.jl:72 within `macro expansion'
; ││┌ @ simdloop.jl:50 within `simd_index'
; │││┌ @ range.jl:616 within `getindex'
; ││││┌ @ int.jl:53 within `+'
	vpaddq	(%rsi){1to8}, %zmm0, %zmm0
	addq	$-32, %rdi
	jne	L208

32 bit int loop body:

L208:
	vmovapd	%zmm6, %zmm7
	vmovapd	%zmm5, %zmm8
	vmovapd	%zmm4, %zmm9
	vmovapd	%zmm2, %zmm10
; │└
; │┌ @ simdloop.jl:72 within `macro expansion'
; ││┌ @ simdloop.jl:50 within `simd_index'
; │││┌ @ range.jl:618 within `getindex'
; ││││┌ @ int.jl:458 within `rem'
	vpsubd	%ymm1, %ymm0, %ymm2
	vpaddd	(%r8){1to8}, %ymm0, %ymm4
	vpaddd	(%r9){1to8}, %ymm0, %ymm5
	vpaddd	(%r10){1to8}, %ymm0, %ymm6
; ││└└└
; ││ @ simdloop.jl:73 within `macro expansion' @ REPL[2]:4
; ││┌ @ promotion.jl:314 within `*'
; │││┌ @ promotion.jl:284 within `promote'
; ││││┌ @ promotion.jl:261 within `_promote'
; │││││┌ @ number.jl:7 within `convert'
; ││││││┌ @ float.jl:60 within `Type'
	vcvtdq2pd	%ymm2, %zmm2
	vcvtdq2pd	%ymm4, %zmm4
	vcvtdq2pd	%ymm5, %zmm5
	vcvtdq2pd	%ymm6, %zmm6
; ││└└└└└
; ││ @ simdloop.jl:73 within `macro expansion' @ float.jl:395
	vfmadd213pd	%zmm10, %zmm3, %zmm2
	vfmadd213pd	%zmm9, %zmm3, %zmm4
	vfmadd213pd	%zmm8, %zmm3, %zmm5
	vfmadd213pd	%zmm7, %zmm3, %zmm6
; ││ @ simdloop.jl:74 within `macro expansion'
; ││┌ @ int.jl:53 within `+'
	vpaddd	(%rdi){1to8}, %ymm0, %ymm0
	addq	$-32, %rcx
	jne	L208

The difference of course being using ymm vs zmm registers for the integers.

So it does vectorize correctly on Skylake-X, at least.

EDIT:
There is also the @fastmath macro:

julia> foo(a, b) = a + b - a - b
foo (generic function with 1 method)

julia> fast_foo(a, b) = @fastmath a + b - a - b
fast_foo (generic function with 1 method)

julia> foo(1e18, 65)
63.0

julia> foo(1e18, 63)
-63.0

julia> fast_foo(1e18, 65)
0.0

julia> fast_foo(1e18, 63)
0.0

Note that special functions like log and sin will not generally be faster under fast math. It’s mostly good in places where @simd doesn’t work to allow for vectorization. I’d also recommend trying @simd ivdep for; sometimes the ivdep is necessary and helps to promise that there aren’t any dependencies between iterations.


#11

I guess there just isn’t any vectorized instruction for Int64 conversions without AVX512, but there is for Int32:

https://www.felixcloutier.com/x86/vcvtqq2pd
https://www.felixcloutier.com/x86/cvtdq2pd


#12

Naively I would sort of expect Int32 to convert to Float64 more straightforwardly. Float64 has something like 53 bits of mantissa. So an Int64 conversion might require a check to see if it uses more significant bits than that. If so, it might have to round up or down to approximate the integer in Float64. This could potentially take more time and operations.

(I don’t understand the assembly, and have no idea about the architecture differences, nor about pipelining, vectorizing, or other hardware tricks. Just speculating here.)


#13

robsmith answered that by saying avx512(vl and dq) have specific cpu instructions for converting 64 bit (quadword) integers to double precision floating point numbers.
Both avx and avx512 have such instructions for 32 bit (doubleword) integers to double precision floating point numbers.

So both conversions are fast and easily vectorized with avx512, but only the 32 bit integer -> 64 bit float is fast on avx platforms.

I don’t know anything about the hardware implementations of these, but a heuristic is that when (a vectorized) assembly instruction exists that does something, it will be fast and allow loops with them to be vectorized. Not all instructions take the same long. Obviously in this case, the loop with 64 bit integers -> 64 bit floats was faster on my platform, at 48 ms vs 65 ms for the entire function, which was doing a lot of operations other than the conversion.

I also don’t know if any error checking may happen, but I’m guessing the equivalent of fno-math-errno would turn that off in a vectorized loop / be required for it to vectorize in the first place? I don’t know the details there, or the extant things like that are controlled in software vs hardware.

Another notable exceptions to being fast when other similar operations exist are gather/scatter operations. But at least they’re still vectorized, so if you have a lot of computations in loops, they could still significantly speed up your code by enabling that loop to be vectorized (and are probably much faster than doing a gazillion vector shuffles to assemble the vectors).

But changing the data layout (if possible) so that you don’t need them will be faster.
IMO, the two biggest tricks to vectorization are avoiding branches (which includes bounds checks and domain-errors from functions like sqrt), and making sure you can load and store contiguous elements into/out of vectors. Broadcasting (filling a vector with a single number) is also fast.