Julia on embedded devices & validation thereof

Or for a less trivial example:

using StaticCompiler
using StaticTools
using LoopVectorization

# In-place matrix product: fills `C` so that `C[row,col]` is the sum over `inner`
# of `A[row,inner] * B[inner,col]`, then returns `C`.
# The index ranges come from `indices`, which is given the pairs of arrays (and
# dimensions) that share each loop index; the whole loop nest is handed to `@turbo`.
@inline function mul!(C::MallocArray, A::MallocArray, B::MallocArray)
    @turbo for col ∈ indices((C,B), 2)
        for row ∈ indices((C,A), 1)
            # Accumulate in the element type of the destination
            acc = zero(eltype(C))
            for inner ∈ indices((A,B), (2,1))
                acc += A[row,inner] * B[inner,col]
            end
            C[row,col] = acc
        end
    end
    return C
end

# Entry point of the compiled program, taking C-style `argc`/`argv`.
# Reads two integers from the command line, fills two matrices with the product
# of their indices, multiplies them with `mul!`, prints the result and frees
# all three matrices.
function loopvec_matrix(argc::Int, argv::Ptr{Ptr{UInt8}})
    # Bail out with a message on stderr unless argc is exactly 3
    if argc != 3
        return printf(stderrp(), c"Incorrect number of command-line arguments\n")
    end
    nrows = parse(Int64, argv, 2)           # First command-line argument
    ncols = parse(Int64, argv, 3)           # Second command-line argument

    # Left-hand matrix (nrows × ncols): entry (i,j) is i*j
    A = MallocArray{Float64}(undef, nrows, ncols)
    @turbo for i ∈ axes(A, 1), j ∈ axes(A, 2)
        A[i,j] = i*j
    end

    # Right-hand matrix (ncols × nrows): entry (i,j) is i*j
    B = MallocArray{Float64}(undef, ncols, nrows)
    @turbo for i ∈ axes(B, 1), j ∈ axes(B, 2)
        B[i,j] = i*j
    end

    # Matrix multiplication: C (ncols × ncols) receives B * A
    C = MallocArray{Float64}(undef, ncols, ncols)
    mul!(C, B, A)

    # Print the product to stdout
    printf(C)

    # Release the manually allocated matrices; the value of the last `free`
    # is the function's result
    free(A)
    free(B)
    return free(C)
end

# Compile `loopvec_matrix`, for argument types `(Int64, Ptr{Ptr{UInt8}})`, into an
# executable. The third argument "./" is presumably the output directory — confirm
# against the StaticCompiler documentation. The call's return value is kept in `path`.
path = compile_executable(loopvec_matrix, (Int64, Ptr{Ptr{UInt8}}), "./")

which gives us

$ ls -alh loopvec_matrix
-rwxr-xr-x  1 me  staff    21K May 22 16:30 loopvec_matrix

$ ./loopvec_matrix 10 3
3.850000e+02	7.700000e+02	1.155000e+03
7.700000e+02	1.540000e+03	2.310000e+03
1.155000e+03	2.310000e+03	3.465000e+03

$ /usr/bin/time -l ./loopvec_matrix 100 100
[output omitted...]
        0.04 real         0.00 user         0.00 sys
   2113536  maximum resident set size
         0  average shared memory size
         0  average unshared data size
         0  average unshared stack size
       532  page reclaims
         0  page faults
         0  swaps
         0  block input operations
         0  block output operations
         0  messages sent
         0  messages received
         0  signals received
       127  voluntary context switches
         3  involuntary context switches

In other words, a 21 kB executable that uses about 2.1 MB of memory (maximum resident set size) to multiply two 100×100 matrices. For comparison, here is the same measurement for `ls`:

$ /usr/bin/time -l ls -alh loopvec_matrix
-rwxr-xr-x  1 me  staff    21K May 22 16:30 loopvec_matrix
        0.00 real         0.00 user         0.00 sys
   2416640  maximum resident set size
         0  average shared memory size
         0  average unshared data size
         0  average unshared stack size
       609  page reclaims
         0  page faults
         0  swaps
         0  block input operations
         0  block output operations
         0  messages sent
         0  messages received
         0  signals received
         0  voluntary context switches
        14  involuntary context switches
17 Likes