I wrote a straightforward implementation of the insertion sort algorithm in Julia below. Then I wrote the exact translation in C to compare with Julia. I noticed a performance gap of about 10% in favor of C implementation. I can’t figure out what is causing this gap or what prevents Julia from reaching C here. I guess the difference comes from arrays in C being fixed-size and stack allocated vs. dynamic arrays in Julia but I’m not sure.

I use `-O3 --inline=yes --check-bounds=no`

in Julia benchmark and enable aggresive optimizations `-Ofast -marh=native`

in C too.

```
function insertion_sort!(x)
for i = 2:length(x)
c = x[i]
j = i
while j > 1 && c < x[j-1]
x[j] = x[j-1]
j -= 1
end
x[j] = c
end
end
m = 10^5
x = rand(1:m,m)
@btime insertion_sort!(y) setup = y = copy(x)
# 1.090 s (0 allocations: 0 bytes)
```

The C code is below for reference.

## Summary

```
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define M 100000
void insertion_sort(long* x) {
for (int i = 1; i < M; ++i) {
int c = x[i];
int j = i;
while (j > 0 && c < x[j-1]) {
x[j] = x[j-1];
j -= 1;
}
x[j] = c;
}
}
int main(int argc, char *argv[]) {
long* x;
x = malloc(M * sizeof(long));
for (int i = 0; i < M; ++i)
x[i] = M * (double)rand() / RAND_MAX + 1;
for (int i = 0; i < 10; ++i) printf("%ld ", x[i]);
clock_t start = clock();
insertion_sort(x);
clock_t end = clock();
double total = (double)(end - start) / CLOCKS_PER_SEC;
printf("\nExecution time: %f\n", total);
for (int i = 0; i < 10; ++i) printf("%ld ", x[i]);
}
// 84019 39439 78310 79845 91165 19756 33523 76823 27778 55397
// Execution time: 1.000000
// 1 2 2 3 3 5 5 6 9 10
```