In case that helps someone figuring out my issue, this is the Julia code:

```
using LinearAlgebra: norm, I
function CGS2(X::Array{Float64,2})
n, m = size(X)
Q = zeros(Float64, n, m)
R = zeros(Float64, m, m)
r = zeros(Float64, m)
w = zeros(Float64, n)
R[1, 1] = norm(X[:, 1])
Q[:, 1] = X[:, 1] ./ R[1, 1]
for i in 2:m
w .= X[:, i]
r .= (w'Q)'
r[i:m] .= 0.
R[:, i] = r
w .-= Q * r
r .= (w'Q)'
r[i:m] .= 0.
w .-= Q * r
R[i, i] = norm(w)
Q[:, i] = w ./ R[i, i]
end
return Q, R
end
n = 1_000_000
m = 250
X = rand(n, m)
dt_CGS2 = @elapsed Q, R = CGS2(X)
println("dt_CGS2 = $dt_CGS2 sec")
print("extrema(Q'Q - I): ")
println(extrema(Q'Q - I))
print("extrema(X - Q * R): ")
println(extrema(X - Q * R))
println()
```

And this is my C code:

```
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <sys/time.h>
#include <stdbool.h>
#include <mkl.h>
void CGS2(int n, int m, double *X, double *Q, double *R);
/*
void CGS2(int n, int m, double *X, double *Q, double *R)
Computes the classical Gram-Schmidt orthogonalization with
re-orthogonalization of m n-vectors.
In:
- int n: Dimension of the vectors.
- int m: Number of vectors to orthogonalize.
- double *X: m n-dimensional vectors assumed linearly independent
and stored by columns in an unrolled column major array.
Out:
- double *Q: Matrix Q of QR decomposition of X stored by columns in an
unrolled column major array.
- double *R: Matrix R of QR decomposition of X stored by columns in an
unrolled column major array.
*/
void CGS2(int n, int m, double *X, double *Q, double *R) {
double *Rtmp = (double*)malloc(m * sizeof(double));
R[0] = sqrt(cblas_ddot(n, &X[0], 1, &X[0], 1));
cblas_dcopy(n, &X[0], 1, &Q[0], 1);
cblas_dscal(n, 1./R[0], &Q[0], 1);
for (int j=1; j<m; j++)
R[j] = 0.;
for (int j=1; j<m; j++) {
cblas_dgemv(CblasColMajor, CblasTrans, n, j, 1., &Q[0], n, &X[j * n], 1, 0., &R[j * m], 1);
cblas_dcopy(n, &X[j * n], 1, &Q[j * n], 1);
cblas_dgemv(CblasColMajor, CblasNoTrans, n, j, -1., &Q[0], n, &R[j * m], 1, 1., &Q[j * n], 1);
cblas_dgemv(CblasColMajor, CblasTrans, n, j, 1., &Q[0], n, &Q[j * n], 1, 0., Rtmp, 1);
cblas_dgemv(CblasColMajor, CblasNoTrans, n, j, -1., &Q[0], n, Rtmp, 1, 1., &Q[j * n], 1);
R[j * m + j] = sqrt(cblas_ddot(n, &Q[j * n], 1, &Q[j * n], 1));
cblas_dscal(n, 1./R[j * m + j], &Q[j * n], 1);
for (int i=j+1; i<m; i++)
R[j * m + i] = 0.;
}
}
int main(int argc, char *argv[]) {
bool check_orthogonality_brgs = true;
int n, m, P;
n = 1000000;
m = 250;
P = 1;
// Generate random vectors
// X contains m n-dimensional vectors stored by row in an unrolled row major array
double *X = (double*)mkl_malloc(n * m * sizeof(double), 64);
for (int i=0; i<m; i++) {
for (int j=0; j<n; j++) {
X[i * n + j] = rand() / (double) RAND_MAX;
}
}
// Set the number of processes
omp_set_num_threads(P);
// Allocate data structures for QR decompositions
double *Q = (double*)mkl_malloc(n * m * sizeof(double), 64);
double *R = (double*)mkl_malloc(m * m * sizeof(double), 64);
double *S = (double*)mkl_malloc(k * m * sizeof(double), 64);
double *QtQ = (double*)mkl_malloc(m * m * sizeof(double), 64);
double *QR = (double*)mkl_malloc(n * m * sizeof(double), 64);
double dQtQ, dX;
long seconds, microseconds;
double dt;
struct timeval begin, end;
gettimeofday(&begin, 0);
CGS2(n, m, X, Q, R);
gettimeofday(&end, 0);
seconds = end.tv_sec - begin.tv_sec;
microseconds = end.tv_usec - begin.tv_usec;
dt = seconds + microseconds * 1e-6;
printf("Time elapsed for CGS2: %f sec.\n", dt);
cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m, n, 1., Q, n, Q, n, 0., QtQ, m);
for (int i=0; i<m; i++)
QtQ[i * m + i] -= 1.;
dQtQ = matrix_2norm(QtQ, m, m);
printf("||I - Q^TQ||_2 = %g\n", dQtQ);
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, m, m, 1., Q, n, R, m, 0., QR, n);
cblas_daxpy(n * m, -1., X, 1, QR, 1);
dX = matrix_2norm(QR, n, m);
printf("||X - QR||_2 = %g\n", dX);
mkl_free(X);
mkl_free(Q);
mkl_free(R);
mkl_free(S);
mkl_free(QtQ);
mkl_free(QR);
return 0;
}
```