Or for a less trivial example:
using StaticCompiler
using StaticTools
using LoopVectorization
# Store the matrix product of `A` and `B` into `C`, then return `C`.
#
# Every entry C[i,j] is built by summing A[i,l] * B[l,j] over l, starting from
# `zero(eltype(C))`. The iteration ranges are obtained from `indices`, which is given
# each pair of arrays together with the dimension(s) they are expected to share
# (presumably it also verifies that those axes agree; confirm in the LoopVectorization
# docs). The whole loop nest is handed to `@turbo`.
@inline function mul!(C::MallocArray, A::MallocArray, B::MallocArray)
    @turbo for j in indices((C, B), 2)
        for i in indices((C, A), 1)
            # Running sum for the (i, j) entry of the product
            acc = zero(eltype(C))
            for l in indices((A, B), (2, 1))
                acc += A[i, l] * B[l, j]
            end
            C[i, j] = acc
        end
    end
    return C
end
# Command-line entry point: read two integers from `argv`, build two matrices whose
# (i, j) entry is i*j, multiply them with `mul!`, print the product, and free all
# three matrices.
#
# `argc` must equal 3; otherwise an error message is written to `stderrp()` and the
# result of that `printf` call is returned. On the normal path the value returned is
# whatever the final `free` call returns.
function loopvec_matrix(argc::Int, argv::Ptr{Ptr{UInt8}})
    # Guard: bail out early unless exactly three argv entries were supplied
    if argc != 3
        return printf(stderrp(), c"Incorrect number of command-line arguments\n")
    end

    # The two sizes are parsed from argv positions 2 and 3
    nrows = parse(Int64, argv, 2)
    ncols = parse(Int64, argv, 3)

    # nrows x ncols matrix; used as the right-hand operand of the product below
    right = MallocArray{Float64}(undef, nrows, ncols)
    @turbo for p ∈ axes(right, 1), q ∈ axes(right, 2)
        right[p, q] = p * q
    end

    # ncols x nrows matrix; used as the left-hand operand of the product below
    left = MallocArray{Float64}(undef, ncols, nrows)
    @turbo for p ∈ axes(left, 1), q ∈ axes(left, 2)
        left[p, q] = p * q
    end

    # Matrix multiplication: product = left * right, an ncols x ncols result
    product = MallocArray{Float64}(undef, ncols, ncols)
    mul!(product, left, right)

    # Print the result to stdout
    printf(product)

    # Release the manually allocated matrices, in the order they were created
    free(right)
    free(left)
    return free(product)
end
# Attempt to compile `loopvec_matrix` into an executable for the argument types
# (Int64, Ptr{Ptr{UInt8}}). The third argument "./" is presumably the output
# directory (confirm against the StaticCompiler docs). The value returned by
# `compile_executable` is kept in `path`.
path = compile_executable(loopvec_matrix, (Int64, Ptr{Ptr{UInt8}}), "./")
which gives us
$ ls -alh loopvec_matrix
-rwxr-xr-x 1 me staff 21K May 22 16:30 loopvec_matrix
$ ./loopvec_matrix 10 3
3.850000e+02 7.700000e+02 1.155000e+03
7.700000e+02 1.540000e+03 2.310000e+03
1.155000e+03 2.310000e+03 3.465000e+03
$ /usr/bin/time -l ./loopvec_matrix 100 100
[output omitted...]
0.04 real 0.00 user 0.00 sys
2113536 maximum resident set size
0 average shared memory size
0 average unshared data size
0 average unshared stack size
532 page reclaims
0 page faults
0 swaps
0 block input operations
0 block output operations
0 messages sent
0 messages received
0 signals received
127 voluntary context switches
3 involuntary context switches
That is, a 21 kB executable that uses about 2.1 MB of memory (maximum resident set size) to multiply two 100x100 matrices. For comparison, here is `ls`:
$ /usr/bin/time -l ls -alh loopvec_matrix
-rwxr-xr-x 1 me staff 21K May 22 16:30 loopvec_matrix
0.00 real 0.00 user 0.00 sys
2416640 maximum resident set size
0 average shared memory size
0 average unshared data size
0 average unshared stack size
609 page reclaims
0 page faults
0 swaps
0 block input operations
0 block output operations
0 messages sent
0 messages received
0 signals received
0 voluntary context switches
14 involuntary context switches