At least on my machine, gcc doesn’t want to use AVX, whereas Julia seemingly does.
I suspect this vectorization is what makes the difference here.
C asm
0000000000001420 <f>:
1420: f3 0f 1e fa endbr64
1424: 48 83 ec 28 sub $0x28,%rsp
1428: f2 0f 11 44 24 18 movsd %xmm0,0x18(%rsp)
142e: 66 0f 28 c1 movapd %xmm1,%xmm0
1432: f2 0f 11 54 24 10 movsd %xmm2,0x10(%rsp)
1438: e8 b3 fc ff ff callq 10f0 <sin@plt>
143d: f2 0f 10 5c 24 18 movsd 0x18(%rsp),%xmm3
1443: f2 0f 59 1d 25 0c 00 mulsd 0xc25(%rip),%xmm3 # 2070 <X0+0x8>
144a: 00
144b: f2 0f 11 44 24 08 movsd %xmm0,0x8(%rsp)
1451: 66 0f 28 c3 movapd %xmm3,%xmm0
1455: e8 76 fc ff ff callq 10d0 <cos@plt>
145a: f2 0f 59 05 1e 0c 00 mulsd 0xc1e(%rip),%xmm0 # 2080 <X0+0x18>
1461: 00
1462: f2 0f 10 54 24 10 movsd 0x10(%rsp),%xmm2
1468: f2 0f 59 15 08 0c 00 mulsd 0xc08(%rip),%xmm2 # 2078 <X0+0x10>
146f: 00
1470: f2 0f 5c 54 24 08 subsd 0x8(%rsp),%xmm2
1476: 48 83 c4 28 add $0x28,%rsp
147a: f2 0f 58 c2 addsd %xmm2,%xmm0
147e: c3 retq
147f: 90 nop
vs.
Julia asm
julia> @code_native acceleration(0.0, 1.0, 0.0, 0.05, 0.6, 0.7)
.text
; ┌ @ REPL[42]:1 within `acceleration'
pushq %rbx
subq $32, %rsp
vmovsd %xmm5, 16(%rsp)
vmovsd %xmm4, 24(%rsp)
vmovsd %xmm0, 8(%rsp)
movabsq $.rodata.cst16, %rax
; │┌ @ float.jl:383 within `-'
vxorpd (%rax), %xmm3, %xmm0
; │└
; │┌ @ float.jl:395 within `*'
vmulsd %xmm2, %xmm0, %xmm0
vmovsd %xmm0, (%rsp)
movabsq $cos, %rbx
; │└
; │┌ @ REPL[38]:1 within `csin'
leaq 17120(%rbx), %rax
vmovapd %xmm1, %xmm0
callq *%rax
vmovsd (%rsp), %xmm1 # xmm1 = mem[0],zero
; │└
; │┌ @ float.jl:392 within `-'
vsubsd %xmm0, %xmm1, %xmm0
vmovsd %xmm0, (%rsp)
vmovsd 8(%rsp), %xmm0 # xmm0 = mem[0],zero
; │└
; │┌ @ float.jl:395 within `*'
vmulsd 16(%rsp), %xmm0, %xmm0
; │└
; │┌ @ REPL[40]:1 within `ccos'
callq *%rbx
; │└
; │┌ @ float.jl:395 within `*'
vmulsd 24(%rsp), %xmm0, %xmm0
; │└
; │┌ @ float.jl:389 within `+'
vaddsd (%rsp), %xmm0, %xmm0
; │└
addq $32, %rsp
popq %rbx
retq
nopw %cs:(%rax,%rax)
; └
no, the calls are not inlined in julia (not with ccos
and not with native sin
), though I did rename them for my understanding - acceleration
is your f
:
callq are in here
L1340:
vmovsd %xmm0, 32(%rsp)
movabsq $sin, %rbx
; │ @ chaotic.jl:18 within `pendulum'
; │┌ @ chaotic.jl:13 within `acceleration'
callq *%rbx
vmovsd %xmm0, 120(%rsp)
vmovsd 24(%rsp), %xmm0 # xmm0 = mem[0],zero
; ││┌ @ float.jl:395 within `*'
vmulsd 80(%rsp), %xmm0, %xmm0
movabsq $cos, %r15
; ││└
callq *%r15
vmovsd %xmm0, 112(%rsp)
vmovsd 72(%rsp), %xmm0 # xmm0 = mem[0],zero
; │└ ```
Julia Code
function pendulum(print=false, x=1.0, v=0.0, dt=0.01, tmax=500.0, Ω=0.70, Γ=0.05, FF=0.6)
if print
out1 = @sprintf("2_dados_rk2_F%.2f_julia.txt", FF)
fout1 = open(out1, "w")
end
acceleration(t, x, v) = -Γ * v - sin(x) + FF * cos(Ω * t)
dt_half = dt / 2.0
for t in range(0.0, tmax; step=dt)
acc_start = acceleration(t, x, v)
t_middle = t + dt_half
x_middle = x + v * dt_half
v_middle = v + acc_start * dt_half
acc_middle = acceleration(t_middle, x_middle, v_middle)
x += v_middle * dt
v += acc_middle * dt
if print
@printf(fout1,"%12.6f %26.20f %26.20f\n", t+dt, x, v)
end
end
if print
close(fout1)
end
end
C code was the one by @lmiq, compiled with gcc -o pendulum pendulum.c -lm -O3
.