I’m porting over some of my Rust code to see if I can make the switch to Julia. My project is a numerical simulation benefits significantly from vectorization, so I’ve been testing a few simple functions in Julia to see how it compares to Rust.
Rust is able to use 256-bit AVX instructions for the following function, so it runs in 140ms, but Julia is only using scalar instructions, so it takes 1,518ms. What can I do to make the Julia code use AVX too? (I tried starting julia with --math-mode=fast
, but that already seems to be the default.)
julia> function f()
s = 0.0
for i = 1e-9 * 1:10^9
s += i * 1.23
end
s
end
Rust:
use fastfloat::*;
fn f() -> F64 {
let mut s = Fast(0.0);
for i in 1..1_000_000_001 {
s += Fast(i as f64) * 1.23;
}
s
}
julia> @code_native f()
.text
; Function f {
; Location: REPL[66]:3
; Function literal_pow; {
; Location: none
; Function macro expansion; {
; Location: none
; Function ^; {
; Location: REPL[66]:2
subq $56, %rsp
movabsq $139783190159232, %rax # imm = 0x7F21CF651F80
movl $10, %edi
movl $9, %esi
callq *%rax
;}}}
; Function Colon; {
; Location: range.jl:3
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
vcvtsi2sdq %rax, %xmm0, %xmm2
;}}}}
; Function Colon; {
; Location: range.jl:14
movabsq $Colon, %rax
movabsq $139782882036504, %rcx # imm = 0x7F21BD078B18
vmovsd (%rcx), %xmm0 # xmm0 = mem[0],zero
movabsq $139782882036512, %rcx # imm = 0x7F21BD078B20
vmovsd (%rcx), %xmm1 # xmm1 = mem[0],zero
leaq 8(%rsp), %rdi
callq *%rax
;}}
; Function iterate; {
; Location: range.jl:567
; Function iterate; {
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
movq 40(%rsp), %rax
testq %rax, %rax
;}
jle L315
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:448
; Function -; {
; Location: int.jl:52
movq 48(%rsp), %rcx
movl $1, %edx
subq %rcx, %rdx
;}
; Location: twiceprecision.jl:449
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
vcvtsi2sdq %rdx, %xmm3, %xmm0
;}}}}
; Function *; {
; Location: float.jl:399
vmovsd 24(%rsp), %xmm8 # xmm8 = mem[0],zero
vmulsd %xmm0, %xmm8, %xmm6
vmovsd 32(%rsp), %xmm9 # xmm9 = mem[0],zero
movabsq $139782882036528, %rsi # imm = 0x7F21BD078B30
;}}}
; Function unsafe_getindex; {
; Location: float.jl:519
vmovapd (%rsi), %xmm1
vandpd %xmm1, %xmm6, %xmm2
vmovsd 8(%rsp), %xmm4 # xmm4 = mem[0],zero
;}
; Function unsafe_getindex; {
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
vmovsd 16(%rsp), %xmm10 # xmm10 = mem[0],zero
;}
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:84
; Function abs; {
; Location: float.jl:519
vandpd %xmm1, %xmm4, %xmm5
;}}
; Function add12; {
; Location: float.jl:452
vucomisd %xmm5, %xmm2
;}}}}}
; Function f {
; Location: twiceprecision.jl:84
vcmpnltsd %xmm2, %xmm5, %xmm1
vblendvpd %xmm1, %xmm4, %xmm6, %xmm1
jbe L179
vmovapd %xmm4, %xmm6
;}
; Function f {
; Location: REPL[66]:3
; Function iterate; {
; Location: range.jl:567
; Function iterate; {
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
L179:
vfmadd213sd %xmm10, %xmm9, %xmm0
;}}}
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
vaddsd %xmm6, %xmm0, %xmm0
vaddsd %xmm1, %xmm0, %xmm0
movabsq $139782882036520, %rdi # imm = 0x7F21BD078B28
;}}}}
; Location: REPL[66]:4
; Function *; {
; Location: float.jl:399
vmulsd (%rdi), %xmm0, %xmm0
;}
; Function iterate; {
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
cmpq $1, %rax
;}
je L310
negq %rcx
movl $2, %edx
vmovapd (%rsi), %xmm11
vmovsd (%rdi), %xmm7 # xmm7 = mem[0],zero
nopw %cs:(%rax,%rax)
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:449
; Function *; {
; Location: promotion.jl:314
; Function promote; {
; Location: promotion.jl:284
; Function _promote; {
; Location: promotion.jl:261
; Function convert; {
; Location: number.jl:7
; Function Type; {
; Location: float.jl:60
L240:
leaq (%rcx,%rdx), %rsi
vcvtsi2sdq %rsi, %xmm12, %xmm1
;}}}}
; Function *; {
; Location: float.jl:399
vmulsd %xmm1, %xmm8, %xmm2
;}}}
; Function unsafe_getindex; {
; Location: float.jl:519
vandpd %xmm11, %xmm2, %xmm6
;}
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: float.jl:452
vucomisd %xmm5, %xmm6
;}}}}
; Function f {
; Location: twiceprecision.jl:84
vmovapd %xmm4, %xmm3
jbe L272
vmovapd %xmm2, %xmm3
L272:
vcmpnltsd %xmm6, %xmm5, %xmm6
vblendvpd %xmm6, %xmm2, %xmm4, %xmm2
;}
; Function f {
; Location: REPL[66]:4
; Function iterate; {
; Location: range.jl:568
; Function unsafe_getindex; {
; Location: twiceprecision.jl:450
; Function add12; {
; Location: twiceprecision.jl:85
; Function canonicalize2; {
; Location: twiceprecision.jl:49
; Function +; {
; Location: float.jl:395
vfmadd213sd %xmm10, %xmm9, %xmm1
;}}}
; Location: twiceprecision.jl:451
; Function +; {
; Location: float.jl:395
vaddsd %xmm2, %xmm1, %xmm1
vaddsd %xmm3, %xmm1, %xmm1
;}}}
; Function +; {
; Location: float.jl:395
vfmadd231sd %xmm7, %xmm1, %xmm0
;}
; Function iterate; {
; Location: range.jl:568
; Function +; {
; Location: int.jl:53
addq $1, %rdx
;}
; Location: range.jl:567
; Function <; {
; Location: int.jl:49
cmpq %rdx, %rax
;}
jge L240
;}
; Location: REPL[66]:6
L310:
addq $56, %rsp
retq
L315:
vxorps %xmm0, %xmm0, %xmm0
; Location: REPL[66]:6
addq $56, %rsp
retq
nopw %cs:(%rax,%rax)
;}
Rust:
4280: 48 83 ec 38 sub $0x38,%rsp
4284: c5 f9 6f 05 74 2d 02 vmovdqa 0x22d74(%rip),%xmm0 # 27000 <_fini+0xe68>
428b: 00
428c: c4 41 29 57 d2 vxorpd %xmm10,%xmm10,%xmm10
4291: b8 00 ca 9a 3b mov $0x3b9aca00,%eax
4296: c4 e2 79 18 0d a1 2e vbroadcastss 0x22ea1(%rip),%xmm1 # 27140 <_fini+0xfa8>
429d: 02 00
429f: c5 f8 29 4c 24 20 vmovaps %xmm1,0x20(%rsp)
42a5: c4 e2 79 18 0d 96 2e vbroadcastss 0x22e96(%rip),%xmm1 # 27144 <_fini+0xfac>
42ac: 02 00
42ae: c5 f8 29 4c 24 10 vmovaps %xmm1,0x10(%rsp)
42b4: c4 e2 79 18 0d 8b 2e vbroadcastss 0x22e8b(%rip),%xmm1 # 27148 <_fini+0xfb0>
42bb: 02 00
42bd: c5 f8 29 0c 24 vmovaps %xmm1,(%rsp)
42c2: c4 e2 7d 19 25 d5 2e vbroadcastsd 0x22ed5(%rip),%ymm4 # 271a0 <_fini+0x1008>
42c9: 02 00
42cb: c4 62 79 58 05 78 2e vpbroadcastd 0x22e78(%rip),%xmm8 # 2714c <_fini+0xfb4>
42d2: 02 00
42d4: c4 62 79 58 0d 73 2e vpbroadcastd 0x22e73(%rip),%xmm9 # 27150 <_fini+0xfb8>
42db: 02 00
42dd: c4 62 79 58 35 6e 2e vpbroadcastd 0x22e6e(%rip),%xmm14 # 27154 <_fini+0xfbc>
42e4: 02 00
42e6: c4 e2 79 58 0d 69 2e vpbroadcastd 0x22e69(%rip),%xmm1 # 27158 <_fini+0xfc0>
42ed: 02 00
42ef: c4 e2 79 58 15 64 2e vpbroadcastd 0x22e64(%rip),%xmm2 # 2715c <_fini+0xfc4>
42f6: 02 00
42f8: c4 41 21 57 db vxorpd %xmm11,%xmm11,%xmm11
42fd: c4 41 19 57 e4 vxorpd %xmm12,%xmm12,%xmm12
4302: c4 41 11 57 ed vxorpd %xmm13,%xmm13,%xmm13
4307: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
430e: 00 00
4310: c5 f9 fe 5c 24 20 vpaddd 0x20(%rsp),%xmm0,%xmm3
4316: c5 7e e6 f8 vcvtdq2pd %xmm0,%ymm15
431a: c5 05 59 fc vmulpd %ymm4,%ymm15,%ymm15
431e: c4 41 05 58 d2 vaddpd %ymm10,%ymm15,%ymm10
4323: c5 f9 fe 6c 24 10 vpaddd 0x10(%rsp),%xmm0,%xmm5
4329: c5 fe e6 db vcvtdq2pd %xmm3,%ymm3
432d: c5 fe e6 ed vcvtdq2pd %xmm5,%ymm5
4331: c5 e5 59 dc vmulpd %ymm4,%ymm3,%ymm3
4335: c4 c1 65 58 db vaddpd %ymm11,%ymm3,%ymm3
433a: c5 d5 59 ec vmulpd %ymm4,%ymm5,%ymm5
433e: c4 c1 55 58 ec vaddpd %ymm12,%ymm5,%ymm5
4343: c5 f9 fe 34 24 vpaddd (%rsp),%xmm0,%xmm6
4348: c5 fe e6 f6 vcvtdq2pd %xmm6,%ymm6
434c: c5 cd 59 f4 vmulpd %ymm4,%ymm6,%ymm6
4350: c4 c1 4d 58 f5 vaddpd %ymm13,%ymm6,%ymm6
4355: c4 c1 79 fe f8 vpaddd %xmm8,%xmm0,%xmm7
435a: c5 fe e6 ff vcvtdq2pd %xmm7,%ymm7
435e: c5 c5 59 fc vmulpd %ymm4,%ymm7,%ymm7
4362: c4 41 45 58 d2 vaddpd %ymm10,%ymm7,%ymm10
4367: c4 c1 79 fe f9 vpaddd %xmm9,%xmm0,%xmm7
436c: c5 fe e6 ff vcvtdq2pd %xmm7,%ymm7
4370: c5 c5 59 fc vmulpd %ymm4,%ymm7,%ymm7
4374: c5 45 58 db vaddpd %ymm3,%ymm7,%ymm11
4378: c4 c1 79 fe de vpaddd %xmm14,%xmm0,%xmm3
437d: c5 fe e6 db vcvtdq2pd %xmm3,%ymm3
4381: c5 e5 59 dc vmulpd %ymm4,%ymm3,%ymm3
4385: c5 65 58 e5 vaddpd %ymm5,%ymm3,%ymm12
4389: c5 f9 fe d9 vpaddd %xmm1,%xmm0,%xmm3
438d: c5 fe e6 db vcvtdq2pd %xmm3,%ymm3
4391: c5 e5 59 dc vmulpd %ymm4,%ymm3,%ymm3
4395: c5 65 58 ee vaddpd %ymm6,%ymm3,%ymm13
4399: c5 f9 fe c2 vpaddd %xmm2,%xmm0,%xmm0
439d: 83 c0 e0 add $0xffffffe0,%eax
43a0: 0f 85 6a ff ff ff jne 4310 <_ZN4play1f17h63840982574dd7feE+0x90>
43a6: c4 c1 25 58 c2 vaddpd %ymm10,%ymm11,%ymm0
43ab: c5 9d 58 c0 vaddpd %ymm0,%ymm12,%ymm0
43af: c5 95 58 c0 vaddpd %ymm0,%ymm13,%ymm0
43b3: c4 e3 7d 19 c1 01 vextractf128 $0x1,%ymm0,%xmm1
43b9: c5 fd 58 c1 vaddpd %ymm1,%ymm0,%ymm0
43bd: c4 e3 79 05 c8 01 vpermilpd $0x1,%xmm0,%xmm1
43c3: c5 f9 58 c1 vaddpd %xmm1,%xmm0,%xmm0
43c7: 48 83 c4 38 add $0x38,%rsp
43cb: c5 f8 77 vzeroupper
43ce: c3 retq
43cf: 90 nop