Numerical difference between Julia and (C, Fortran, Python) for a chaotic damped driven pendulum

At least on my machine, gcc doesn’t want to use AVX, whereas Julia seemingly does.

I suspect this vectorization is what makes the difference here.

C asm
0000000000001420 <f>:                                                                   
    1420:       f3 0f 1e fa             endbr64                                         
    1424:       48 83 ec 28             sub    $0x28,%rsp                               
    1428:       f2 0f 11 44 24 18       movsd  %xmm0,0x18(%rsp)                         
    142e:       66 0f 28 c1             movapd %xmm1,%xmm0                              
    1432:       f2 0f 11 54 24 10       movsd  %xmm2,0x10(%rsp)                         
    1438:       e8 b3 fc ff ff          callq  10f0 <sin@plt>                           
    143d:       f2 0f 10 5c 24 18       movsd  0x18(%rsp),%xmm3                         
    1443:       f2 0f 59 1d 25 0c 00    mulsd  0xc25(%rip),%xmm3        # 2070 <X0+0x8> 
    144a:       00                                                                      
    144b:       f2 0f 11 44 24 08       movsd  %xmm0,0x8(%rsp)                          
    1451:       66 0f 28 c3             movapd %xmm3,%xmm0                              
    1455:       e8 76 fc ff ff          callq  10d0 <cos@plt>                           
    145a:       f2 0f 59 05 1e 0c 00    mulsd  0xc1e(%rip),%xmm0        # 2080 <X0+0x18>
    1461:       00                                                                      
    1462:       f2 0f 10 54 24 10       movsd  0x10(%rsp),%xmm2                         
    1468:       f2 0f 59 15 08 0c 00    mulsd  0xc08(%rip),%xmm2        # 2078 <X0+0x10>
    146f:       00                                                                      
    1470:       f2 0f 5c 54 24 08       subsd  0x8(%rsp),%xmm2                          
    1476:       48 83 c4 28             add    $0x28,%rsp                               
    147a:       f2 0f 58 c2             addsd  %xmm2,%xmm0                              
    147e:       c3                      retq                                            
    147f:       90                      nop                                             

vs.

Julia asm
julia> @code_native acceleration(0.0, 1.0, 0.0, 0.05, 0.6, 0.7)     
        .text                                                       
; ┌ @ REPL[42]:1 within `acceleration'                              
        pushq   %rbx                                                
        subq    $32, %rsp                                           
        vmovsd  %xmm5, 16(%rsp)                                     
        vmovsd  %xmm4, 24(%rsp)                                     
        vmovsd  %xmm0, 8(%rsp)                                      
        movabsq $.rodata.cst16, %rax                                
; │┌ @ float.jl:383 within `-'                                      
        vxorpd  (%rax), %xmm3, %xmm0                                
; │└                                                                
; │┌ @ float.jl:395 within `*'                                      
        vmulsd  %xmm2, %xmm0, %xmm0                                 
        vmovsd  %xmm0, (%rsp)                                       
        movabsq $cos, %rbx                                          
; │└                                                                
; │┌ @ REPL[38]:1 within `csin'                                     
        leaq    17120(%rbx), %rax                                   
        vmovapd %xmm1, %xmm0                                        
        callq   *%rax                                               
        vmovsd  (%rsp), %xmm1                   # xmm1 = mem[0],zero
; │└                                                                
; │┌ @ float.jl:392 within `-'                                      
        vsubsd  %xmm0, %xmm1, %xmm0                                 
        vmovsd  %xmm0, (%rsp)                                       
        vmovsd  8(%rsp), %xmm0                  # xmm0 = mem[0],zero
; │└                                                                
; │┌ @ float.jl:395 within `*'                                      
        vmulsd  16(%rsp), %xmm0, %xmm0                              
; │└                                                                
; │┌ @ REPL[40]:1 within `ccos'                                     
        callq   *%rbx                                               
; │└                                                                
; │┌ @ float.jl:395 within `*'                                      
        vmulsd  24(%rsp), %xmm0, %xmm0                              
; │└                                                                
; │┌ @ float.jl:389 within `+'                                      
        vaddsd  (%rsp), %xmm0, %xmm0                                
; │└                                                                
        addq    $32, %rsp                                           
        popq    %rbx                                                
        retq                                                        
        nopw    %cs:(%rax,%rax)                                     
; └                                                                 

no, the calls are not inlined in julia (not with ccos and not with native sin), though I did rename them for my understanding - acceleration is your f:

callq are in here
L1340:                                                              
        vmovsd  %xmm0, 32(%rsp)                                     
        movabsq $sin, %rbx                                          
; │ @ chaotic.jl:18 within `pendulum'                               
; │┌ @ chaotic.jl:13 within `acceleration'                          
        callq   *%rbx                                               
        vmovsd  %xmm0, 120(%rsp)                                    
        vmovsd  24(%rsp), %xmm0                 # xmm0 = mem[0],zero
; ││┌ @ float.jl:395 within `*'                                     
        vmulsd  80(%rsp), %xmm0, %xmm0                              
        movabsq $cos, %r15                                          
; ││└                                                               
        callq   *%r15                                               
        vmovsd  %xmm0, 112(%rsp)                                    
        vmovsd  72(%rsp), %xmm0                 # xmm0 = mem[0],zero
; │└                                                                ```
Julia Code
function pendulum(print=false, x=1.0, v=0.0, dt=0.01, tmax=500.0, Ω=0.70, Γ=0.05, FF=0.6)
    if print
        out1 = @sprintf("2_dados_rk2_F%.2f_julia.txt", FF)
        fout1 = open(out1, "w")
    end
    acceleration(t, x, v) = -Γ * v - sin(x) + FF * cos(Ω * t)

    dt_half = dt / 2.0

    for t in range(0.0, tmax; step=dt)
        acc_start = acceleration(t, x, v)
        
        t_middle  = t + dt_half
        x_middle  = x + v * dt_half
        v_middle  = v + acc_start * dt_half	          
        acc_middle = acceleration(t_middle, x_middle, v_middle)                  
        
        x += v_middle * dt	
        v += acc_middle * dt
  
        if print
            @printf(fout1,"%12.6f %26.20f %26.20f\n", t+dt, x, v)
        end
    end
    if print
        close(fout1)
    end
end

C code was the one by @lmiq, compiled with gcc -o pendulum pendulum.c -lm -O3.

3 Likes