How to call BLAS getrf, getri properly? I want to benchmark matrix inversion using Gauss, Crout, native Julia inv(A), and direct BLAS calls

Hello, I am new here. I come from Fortran, and this is just for fun: I tried to convert test_fpu.f90 from the Polyhedron benchmark, which benchmarks matrix inversion.
I heard Julia is multithreaded and easier to code, and the Octavian GitHub page says its dgemm performance is higher, so I want to compare against test_fpu.f90. Basically it benchmarks: Gauss inversion of a 250x250 random double-precision matrix, applied twice so the round-trip difference can be checked as an average error, repeated 1000 times; the same with Crout, 1000 times; Crout inversion of a 2000x2000 random double matrix, checked the same way, once; and inversion with LAPACK dgetrf and dgetri, applied twice and checked, once. I have successfully used the native inv(A), but I can't call LAPACK directly:

using LinearAlgebra
using Random
function average_error(A, A2)
    n = size(A, 1)
    return sum(abs.(A .- A2)) / (n * n)
end

function gauss_inverse!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A)
    ipvt = collect(1:n)
    temp = zeros(n)

    for k in 1:n
        # Partial pivoting
        imax = argmax(abs.(B[k:n, k])) + k - 1
        m = imax
        if m != k
            ipvt[[m, k]] = ipvt[[k, m]]
            B[[m, k], :] .= B[[k, m], :]
        end
        
        d = 1.0 / B[k, k]
        temp .= B[:, k]  # Store current column
        
        # Perform elimination
        for j in 1:n
            c = B[k, j] * d
            B[:, j] .-= temp .* c
            B[k, j] = c
        end
        
        # Store back the transformed column
        B[:, k] .= temp .* (-d)
        B[k, k] = d
    end
    
    # Apply final permutation
    A[:, ipvt] .= B
    return A
end
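
# Quick sanity check (an added illustration, not part of the original
# benchmark): the computed inverse times the original should be close to I.
let Atest = randn(4, 4)
    Ainv_test = gauss_inverse!(copy(Atest))
    @assert maximum(abs.(Ainv_test * Atest - I)) < 1e-8
end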

function crout_inverse(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n)
    Ainv = zeros(n, n)

    for j in 1:n
        # Compute column j of L
        for i in j:n
            L[i,j] = A[i,j] - sum(L[i,1:j-1] .* U[1:j-1,j])
        end
        
        # Compute row j of U
        for i in j+1:n
            U[j,i] = (A[j,i] - sum(L[j,1:j-1] .* U[1:j-1,i])) / L[j,j]
        end
    end

    # Solve for the inverse column by column
    for i in 1:n
        e = zeros(n)
        e[i] = 1.0
        y = L \ e
        Ainv[:,i] = U \ y
    end
    return Ainv
end
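
# Note (an optional tweak, not in the original): `L \ e` on a plain Matrix
# re-checks the matrix structure on every call before dispatching to a
# triangular solve. Wrapping the factors makes the O(n^2) solves explicit:
#   y = LowerTriangular(L) \ e
#   Ainv[:, i] = UpperTriangular(U) \ y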

# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
# dgetrf! computes the LU factorization with partial pivoting
# dgetri! computes the inverse from the LU factorization
function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")

    # Create a copy since we modify in-place
    A_copy = copy(A)

    # Step 1: LU factorization (getrf!)
    # In Julia, getrf! modifies A_copy in-place and returns (A_copy, ipiv, info)
    result = LinearAlgebra.LAPACK.getrf!(A_copy)
    A_lu = result[1] # This is the modified A_copy
    ipiv = result[2]
    info_getrf = result[3] # Use a specific variable name

    # Debug prints for info from getrf!
    println("DEBUG: info_getrf value after getrf!: ", info_getrf)
    println("DEBUG: info_getrf type after getrf!: ", typeof(info_getrf))

    # Check for errors from getrf!
    if info_getrf > 0
        error("Matrix is singular in getrf! at position $info_getrf (from getrf!)")
    elseif info_getrf < 0
        error("Invalid argument in getrf! at position $(-info_getrf) (from getrf!)")
    end

    # Step 2: Matrix inversion (getri!)
    # In Julia, getri! modifies A_lu in-place with the inverse and returns (A_lu, info)
    inv_result = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
    A_inv = inv_result[1] # This is the modified A_lu, which is now the inverse
    info_getri = inv_result[2] # Use a specific variable name

    # Debug prints for info from getri!
    println("DEBUG: info_getri value after getri!: ", info_getri)
    println("DEBUG: info_getri type after getri!: ", typeof(info_getri))


    # Check for errors in getri!
    if info_getri > 0
         error("Matrix is singular in getri! at position $info_getri (from getri!)")
    elseif info_getri < 0
        error("Invalid argument in getri! at position $(-info_getri) (from getri!)")
    end

    return A_inv
end


function run_benchmarks()
    small_n = 250
    large_n = 2000
    reps = 1000

    println("Small matrix benchmark ($small_nΓ—$small_n), $reps repetitions")

    Random.seed!(42)
    A = randn(small_n, small_n)

    # === [1] Gauss ===
    println("\n[1] Gauss Elimination:")
    err_sum = 0.0
    t1 = @elapsed for i in 1:reps
     Ainv = copy(A)
     gauss_inverse!(Ainv)
     A2 = copy(Ainv)
     gauss_inverse!(A2)
     err_sum += average_error(A, A2)
     if i % 100 == 0
        println("  Iteration $i / $reps")
     end
    end
    println("Time: ", round(t1, digits=3), " seconds")
    println("Average error: ", err_sum / reps)


    # === [2] Crout ===
    println("\n[2] Crout Inversion:")
    err_sum = 0.0
    t2 = @elapsed for i in 1:reps
      Ainv = crout_inverse(A)
      A2 = crout_inverse(Ainv)
      err_sum += average_error(A, A2)
      if i % 100 == 0
        println("  Iteration $i / $reps")
      end
     end
    println("Time: ", round(t2, digits=3), " seconds")
    println("Average error: ", err_sum / reps)

    println("\nLarge matrix benchmark ($large_nΓ—$large_n), 1 repetition")

    Random.seed!(123)
    B = randn(large_n, large_n)

    # === [3] Crout Large ===
    println("\n[3] Crout Large:")
    t3 = @elapsed begin
        Binv = crout_inverse(B)
        B2 = crout_inverse(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t3, digits=3), " seconds")

    # === [4] INV native Julia Large ===
    println("\n[4] Native Julia (inv) Large:")
    t4 = @elapsed begin
        Binv = inv(B)
        B2 = inv(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t4, digits=3), " seconds")


    # === [5] Lapack manual Large ===
    println("\n[5] LAPACK (LU + inv) Large:")
    t5 = @elapsed begin
        Binv = lapack_inverse(B)
        B2 = lapack_inverse(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t5, digits=3), " seconds")

end

run_benchmarks()

I got an error like this:

C:\Temp>julia test_fpu_1corev3.jl
Small matrix benchmark (250×250), 1000 repetitions

[1] Gauss Elimination:
  Iteration 100 / 1000
  Iteration 200 / 1000
  Iteration 300 / 1000
  Iteration 400 / 1000
  Iteration 500 / 1000
  Iteration 600 / 1000
  Iteration 700 / 1000
  Iteration 800 / 1000
  Iteration 900 / 1000
  Iteration 1000 / 1000
Time: 37.984 seconds
Average error: 7.235891152085948e-15

[2] Crout Inversion:
  Iteration 100 / 1000
  Iteration 200 / 1000
  Iteration 300 / 1000
  Iteration 400 / 1000
  Iteration 500 / 1000
  Iteration 600 / 1000
  Iteration 700 / 1000
  Iteration 800 / 1000
  Iteration 900 / 1000
  Iteration 1000 / 1000
Time: 79.338 seconds
Average error: 3.599229967813043e-11

Large matrix benchmark (2000×2000), 1 repetition

[3] Crout Large:
Average error: 9.277946949708182e-10
Time: 44.639 seconds

[4] Native Julia (inv) Large:
Average error: 5.759763066411798e-13
Time: 2.297 seconds

[5] LAPACK (LU + inv) Large:
DEBUG: info_getrf value after getrf!: 0
DEBUG: info_getrf type after getrf!: Int64
DEBUG: info_getri value after getri!: 0.014762497648100295
DEBUG: info_getri type after getri!: Float64
ERROR: LoadError: Matrix is singular in getri! at position 0.014762497648100295 (from getri!)
Stacktrace:
 [1] error(s::String)
   @ Base .\error.jl:35
 [2] lapack_inverse(A::Matrix{Float64})
   @ Main C:\Temp\test_fpu_1corev3.jl:112
 [3] macro expansion
   @ C:\Temp\test_fpu_1corev3.jl:191 [inlined]
 [4] macro expansion
   @ .\timing.jl:421 [inlined]
 [5] run_benchmarks()
   @ Main C:\Temp\test_fpu_1corev3.jl:190
 [6] top-level scope
   @ C:\Temp\test_fpu_1corev3.jl:200
in expression starting at C:\Temp\test_fpu_1corev3.jl:200

C:\Temp>notepad test_fpu_1corev3.jl

That is using the internal BLAS; I don't know how to call others. How do I call the BLAS functions via the internal library, OpenBLAS, MKL, or Octavian?


Here’s your error. getri! only returns A_inv, no info (it checks LAPACK’s info internally and throws an error if nonzero, so if you get a return you know it’s valid). So just do

A_inv = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
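
For reference, a minimal corrected version of the whole function could look like this (a sketch: getrf! still returns (A, ipiv, info), while getri! validates info internally and throws on failure):

using LinearAlgebra

function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")
    # LU factorization; getrf! returns the factored matrix, pivots, and info
    A_lu, ipiv, info = LinearAlgebra.LAPACK.getrf!(copy(A))
    info > 0 && error("Matrix is singular at position $info")
    info < 0 && error("Invalid argument at position $(-info)")
    # getri! overwrites A_lu with the inverse and returns it
    return LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
end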

Yes, finally I understand.
Here is the full code now:

using LinearAlgebra
using LinearAlgebra.BLAS
BLAS.set_num_threads(Sys.CPU_THREADS)
using Random

function average_error(A, A2)
    n = size(A, 1)
    return sum(abs.(A .- A2)) / (n * n)
end

function gauss_inverse!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A)
    ipvt = collect(1:n)
    temp = zeros(n)

    for k in 1:n
        # Partial pivoting
        imax = argmax(abs.(B[k:n, k])) + k - 1
        m = imax
        if m != k
            ipvt[[m, k]] = ipvt[[k, m]]
            B[[m, k], :] .= B[[k, m], :]
        end
        
        d = 1.0 / B[k, k]
        temp .= B[:, k]  # Store current column
        
        # Perform elimination
        for j in 1:n
            c = B[k, j] * d
            B[:, j] .-= temp .* c
            B[k, j] = c
        end
        
        # Store back the transformed column
        B[:, k] .= temp .* (-d)
        B[k, k] = d
    end
    
    # Apply final permutation
    A[:, ipvt] .= B
    return A
end

function crout_inverse(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n)
    Ainv = zeros(n, n)

    for j in 1:n
        # Compute column j of L
        for i in j:n
            L[i,j] = A[i,j] - sum(L[i,1:j-1] .* U[1:j-1,j])
        end
        
        # Compute row j of U
        for i in j+1:n
            U[j,i] = (A[j,i] - sum(L[j,1:j-1] .* U[1:j-1,i])) / L[j,j]
        end
    end

    # Solve for the inverse column by column
    for i in 1:n
        e = zeros(n)
        e[i] = 1.0
        y = L \ e
        Ainv[:,i] = U \ y
    end
    return Ainv
end

# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
# dgetrf! computes the LU factorization with partial pivoting
# dgetri! computes the inverse from the LU factorization
function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")

    # Create a copy since we modify in-place
    A_copy = copy(A)

    # Step 1: LU factorization (getrf!)
    # In Julia, getrf! modifies A_copy in-place and returns (A_copy, ipiv, info)
    result = LinearAlgebra.LAPACK.getrf!(A_copy)
    A_lu = result[1] # This is the modified A_copy
    ipiv = result[2]
    info_getrf = result[3] # Use a specific variable name

    # Debug prints for info from getrf!
    println("DEBUG: info_getrf value after getrf!: ", info_getrf)
    println("DEBUG: info_getrf type after getrf!: ", typeof(info_getrf))

    # Check for errors from getrf!
    if info_getrf > 0
        error("Matrix is singular in getrf! at position $info_getrf (from getrf!)")
    elseif info_getrf < 0
        error("Invalid argument in getrf! at position $(-info_getrf) (from getrf!)")
    end

    # Step 2: Matrix inversion (getri!)
    # getri! overwrites A_lu with the inverse and returns it; it checks
    # LAPACK's info internally and throws an error if it is nonzero.
    return LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
end


function run_benchmarks()
    small_n = 250
    large_n = 2000
    reps = 1000

    println("Small matrix benchmark ($small_nΓ—$small_n), $reps repetitions")

    Random.seed!(42)
    A = randn(small_n, small_n)

    # === [1] Gauss ===
    println("\n[1] Gauss Elimination:")
    err_sum = 0.0
    t1 = @elapsed for i in 1:reps
     Ainv = copy(A)
     gauss_inverse!(Ainv)
     A2 = copy(Ainv)
     gauss_inverse!(A2)
     err_sum += average_error(A, A2)
     if i % 100 == 0
        println("  Iteration $i / $reps")
     end
    end
    println("Time: ", round(t1, digits=3), " seconds")
    println("Average error: ", err_sum / reps)


    # === [2] Crout ===
    println("\n[2] Crout Inversion:")
    err_sum = 0.0
    t2 = @elapsed for i in 1:reps
      Ainv = crout_inverse(A)
      A2 = crout_inverse(Ainv)
      err_sum += average_error(A, A2)
      if i % 100 == 0
        println("  Iteration $i / $reps")
      end
     end
    println("Time: ", round(t2, digits=3), " seconds")
    println("Average error: ", err_sum / reps)

    println("\nLarge matrix benchmark ($large_nΓ—$large_n), 1 repetition")

    Random.seed!(123)
    B = randn(large_n, large_n)

    # === [3] Crout Large ===
    println("\n[3] Crout Large:")
    t3 = @elapsed begin
        Binv = crout_inverse(B)
        B2 = crout_inverse(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t3, digits=3), " seconds")

    # === [4] INV native Julia Large ===
    println("\n[4] Native Julia (inv) Large:")
    t4 = @elapsed begin
        Binv = inv(B)
        B2 = inv(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t4, digits=3), " seconds")


    # === [5] Lapack manual Large ===
    println("\n[5] LAPACK (LU + inv) Large:")
    t5 = @elapsed begin
        Binv = lapack_inverse(B)
        B2 = lapack_inverse(Binv)
        err = average_error(B, B2)
        println("Average error: ", err)
    end
    println("Time: ", round(t5, digits=3), " seconds")

end

run_benchmarks()

Looks like the native direct inverse was faster than manually calling BLAS dgetrf/dgetri:

C:\Temp>julia -t 24 test_fpu_1corev3.jl
Small matrix benchmark (250×250), 1000 repetitions

[1] Gauss Elimination:
  Iteration 100 / 1000
  Iteration 200 / 1000
  Iteration 300 / 1000
  Iteration 400 / 1000
  Iteration 500 / 1000
  Iteration 600 / 1000
  Iteration 700 / 1000
  Iteration 800 / 1000
  Iteration 900 / 1000
  Iteration 1000 / 1000
Time: 35.519 seconds
Average error: 7.235891152085948e-15

[2] Crout Inversion:
  Iteration 100 / 1000
  Iteration 200 / 1000
  Iteration 300 / 1000
  Iteration 400 / 1000
  Iteration 500 / 1000
  Iteration 600 / 1000
  Iteration 700 / 1000
  Iteration 800 / 1000
  Iteration 900 / 1000
  Iteration 1000 / 1000
Time: 71.157 seconds
Average error: 3.599229967813043e-11

Large matrix benchmark (2000×2000), 1 repetition

[3] Crout Large:
Average error: 9.277946949708182e-10
Time: 43.501 seconds

[4] Native Julia (inv) Large:
Average error: 5.759763066411798e-13
Time: 0.455 seconds

[5] LAPACK (LU + inv) Large:
DEBUG: info_getrf value after getrf!: 0
DEBUG: info_getrf type after getrf!: Int64
DEBUG: info_getrf value after getrf!: 0
DEBUG: info_getrf type after getrf!: Int64
Average error: 5.759763066411798e-13
Time: 0.92 seconds

Second question:
how do I use MKL?
I have finished installing it.

julia> using MKL

julia> using LinearAlgebra

julia> BLAS.get_config()
LinearAlgebra.BLAS.LBTConfig
Libraries:
β”œ [ILP64] mkl_rt.2.dll
β”” [ LP64] mkl_rt.2.dll

julia> exit()

I edited the code to:

using MKL
using LinearAlgebra
using LinearAlgebra.BLAS
BLAS.set_num_threads(Sys.CPU_THREADS)
using Random

..
function run_benchmarks()
    small_n = 250
    large_n = 2000
    reps = 1000
    println("BLAS configuration:")
    println("  Library: ", BLAS.vendor())
    println("  Threads: ", BLAS.get_num_threads())
    println("Small matrix benchmark ($small_nΓ—$small_n), $reps repetitions")
...
end
..

Why is it still sticking to OpenBLAS?
C:\Temp>julia test_fpu_1core_mkl.jl
BLAS configuration:
  Library: lbt
  Threads: 12
Small matrix benchmark (250×250), 1000 repetitions

Julia compiles your code the first time you call it. You’re likely measuring compilation time, not runtime. You should either make sure to call your function twice and only time the second call, or you should use something like @btime or @benchmark from BenchmarkTools.jl to get precise timings. Here are the timings from BenchmarkTools.jl on my laptop:

julia> using LinearAlgebra, BenchmarkTools

julia> function lapack_inverse(A::Matrix{Float64})
           # function body omitted for brevity---I copied your code verbatim
       end

julia> @benchmark begin
           Binv = inv(B)
           B2 = inv(Binv)
       end setup=(B = randn(2000, 2000))
BenchmarkTools.Trial: 45 samples with 1 evaluation per sample.
 Range (min … max):   92.878 ms … 155.635 ms  ┊ GC (min … max): 0.09% … 1.02%
 Time  (median):     101.919 ms               ┊ GC (median):    1.50%
 Time  (mean ± σ):   105.377 ms ±  11.724 ms  ┊ GC (mean ± σ):  1.61% ± 3.07%

      ▅▂█   ▅  █
  █▅███████▅█▅▁█▅▁▁█▁▁▁▅▅▁▁█▅▁▁▁▁▅▁▁▁▁▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅ ▁
  92.9 ms          Histogram: frequency by time          156 ms <

 Memory estimate: 63.02 MiB, allocs estimate: 10.

julia> redirect_stdout(devnull) do  # This is to silence the printing
           @benchmark begin
               Binv = lapack_inverse(B)
               B2 = lapack_inverse(Binv)
           end setup=(B = randn(2000, 2000))
       end
BenchmarkTools.Trial: 46 samples with 1 evaluation per sample.
 Range (min … max):   94.771 ms … 126.662 ms  ┊ GC (min … max): 0.09% … 21.00%
 Time  (median):     100.424 ms               ┊ GC (median):    1.52%
 Time  (mean ± σ):   101.896 ms ±   6.740 ms  ┊ GC (mean ± σ):  1.62% ±  3.05%

    ▃    ▁▃  ▁ ▁  █
  ▄▄█▇▇▇▇██▄▄█▁█▇▁█▄▄▁▁▁▄▄▁▁▁▄▁▄▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇ ▁
  94.8 ms          Histogram: frequency by time          127 ms <

 Memory estimate: 63.02 MiB, allocs estimate: 32.

As you can see, the performance of inv and lapack_inverse is virtually identical and an order of magnitude faster than the 0.9 seconds you measured for lapack_inverse, indicating that you were mainly measuring compilation time (we don’t have identical computers, of course, but I don’t think my computer is 10x faster than yours).
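
If you prefer to keep plain @elapsed, a simple warm-up call before the timed section also does the trick (a sketch):

B = randn(2000, 2000)
lapack_inverse(randn(10, 10))   # first call compiles the function
t = @elapsed lapack_inverse(B)  # second call measures only runtime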

It’s not a surprise that they are identical, as the implementation of inv is more or less identical to your lapack_inverse. See LinearAlgebra.jl/src/dense.jl at release-1.11 · JuliaLang/LinearAlgebra.jl · GitHub in combination with LinearAlgebra.jl/src/lu.jl at release-1.11 · JuliaLang/LinearAlgebra.jl · GitHub, as well as the implementation of lu in the same file; the latter has a lot of indirection to handle multiple matrix types, but for Matrix{Float64} it ultimately just ends up calling getrf! and checking info for errors, just like your code.
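
In sketch form (my_inv is an illustrative name, not the actual library source):

using LinearAlgebra

# Roughly what inv(::Matrix{Float64}) boils down to:
function my_inv(A::Matrix{Float64})
    F = lu(A)                     # LAPACK.getrf! plus the info check
    return LinearAlgebra.inv!(F)  # LAPACK.getri! on the LU factors
end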

I think you already are. Note that BLAS.vendor() is a remnant from earlier Julia versions and is not useful anymore. It will only tell you that BLAS/LAPACK is provided by libblastrampoline (lbt), which is the BLAS/LAPACK demuxer that allows Julia to swap between different backends at runtime. To see which backend you’re actually using, you should call BLAS.get_config() like you did in your interactive session. If it lists libraries called things like mkl_rt.2.dll, you’re using MKL.
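
If you want to verify programmatically, something along these lines should work (a sketch; it inspects the libraries loaded through libblastrampoline, using the field names of recent Julia versions):

using LinearAlgebra
uses_mkl = any(lib -> occursin("mkl", lowercase(basename(lib.libname))),
               BLAS.get_config().loaded_libs)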

  1. Hmm, with @btime I don't know how to print the repetition progress, like 100/1000 and so on (see the sketch after this list).
C:\Temp>julia test_fpu_1core_mkl_timings.jl
BLAS configuration:
  Library: lbt
  Threads: 12
BLAS backend libraries:
LBTConfig([ILP64] mkl_rt.2.dll, [LP64] mkl_rt.2.dll)

Small matrix benchmark (250×250), 1000 reps

[1] Gauss Elimination:
  43.509 s (259354000 allocations: 246.67 GiB)
Average error: 2.170767345625942e-14

[2] Crout Inversion:
  85.268 s (750027000 allocations: 267.01 GiB)
Average error: 1.3786365392807404e-10

Large matrix benchmark (2000×2000), 1 rep

[3] Crout Large:
  45.665 s (66345051 allocations: 122.14 GiB)
Average error: 1.8646103528466233e-9

[4] Native Julia (inv) Large:
  191.761 ms (23 allocations: 93.54 MiB)
Average error: 1.0998328686413708e-12

[5] LAPACK (LU + inv) Large:
  191.918 ms (23 allocations: 93.54 MiB)
Average error: 1.0998328686413708e-12
  2. Yes, apparently the backend does go to MKL:
    println("BLAS backend libraries:")
    println(BLAS.get_config())
  3. Since the small-matrix case runs many repetitions, I am thinking maybe we can use all the CPU threads.
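
On the progress question: @btime and @benchmark drive the evaluation loop themselves, so they cannot print per-iteration progress. A workaround (a sketch reusing lapack_inverse from above) is to warm up once and then time your own loop with @elapsed, keeping the progress prints:

A = randn(250, 250)
reps = 1000
lapack_inverse(randn(10, 10))  # warm-up so compilation is excluded
t = @elapsed for i in 1:reps
    Ainv = lapack_inverse(A)
    i % 100 == 0 && println("  Iteration $i / $reps")
end
println("Time: ", round(t, digits=3), " seconds")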

Source:

using MKL # Attempt to load MKL - this should be done before LinearAlgebra in a fresh session
using LinearAlgebra
using Random
using LinearAlgebra.BLAS
using Printf # For formatted output
using Statistics # For mean function
using BenchmarkTools # For accurate timing and statistics

# Ensure Julia is started with multiple threads (e.g., julia -t auto)
if Threads.nthreads() == 1
    println("WARNING: Julia is running with only 1 thread. For parallel benchmarks, start Julia with `julia -t auto` or `julia -t N` where N > 1.")
    println("Current Julia Threads: ", Threads.nthreads())
else
    println("Julia Threads: ", Threads.nthreads()) # Show number of Julia threads
end
println("BLAS Threads: ", LinearAlgebra.BLAS.get_num_threads()) # Show number of BLAS threads (set by default or environment)


function average_error(A, A2)
    n = size(A, 1)
    # Handle potential size mismatch or non-numeric types if errors occurred
    if size(A) != size(A2) || !isa(A2, AbstractMatrix{Float64})
        return NaN
    end
    return sum(abs.(A .- A2)) / (n * n)
end

# === Implementations ===

# Standard Serial Gauss-Jordan Elimination based Inverse (Reference for accuracy)
function gauss_jordan_inverse_serial!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A) # Work on a copy of A
    I_matrix = Matrix{Float64}(I, n, n) # Start with an identity matrix

    @inbounds for k in 1:n
        # Partial Pivoting
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            # Swap rows in both B and I_matrix
            B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
            I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
        end

        pivot_element = B[k, k]
        if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
            error("Matrix is numerically singular at step $k (Gauss-Jordan Serial)")
        end

        # Scale the pivot row to make the pivot element 1
        scale = 1.0 / pivot_element
        B[k, :] .*= scale
        I_matrix[k, :] .*= scale

        # Perform elimination for other rows
        for i in 1:n
            if i != k
                factor = B[i, k]
                # Vectorized subtraction using broadcast
                B[i, :] .-= factor .* B[k, :]
                I_matrix[i, :] .-= factor .* I_matrix[k, :]
            end
        end
    end

    # After reduction, I_matrix holds the inverse
    return I_matrix
end

# Threaded Gauss Variation 1: Parallelize Elimination over Rows i (Validated Accurate)
function gauss_inverse_threaded_v1!(A::Matrix{Float64})
    n = size(A, 1)
    B = copy(A) # Augment with Identity
    I_matrix = Matrix{Float64}(I, n, n) # Identity matrix

    @inbounds for k in 1:n
        # Partial Pivoting (Serial)
        pivot_row = k
        for i in k+1:n
            if abs(B[i, k]) > abs(B[pivot_row, k])
                pivot_row = i
            end
        end

        if pivot_row != k
            # Swap rows in both B and I_matrix - Parallelize row swaps across all columns
            Threads.@threads for col = 1:n
                 B[k, col], B[pivot_row, col] = B[pivot_row, col], B[k, col]
                 I_matrix[k, col], I_matrix[pivot_row, col] = I_matrix[pivot_row, col], I_matrix[k, col]
            end
        end

        pivot_element = B[k, k]
        if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
            error("Matrix is numerically singular at step $k (Gauss Threaded V1)")
        end

        # Scale the pivot row - Parallelize scaling across columns
        scale = 1.0 / pivot_element
        Threads.@threads for col = 1:n
             B[k, col] *= scale
             I_matrix[k, col] *= scale
        end

        # Perform elimination for other rows - Parallelize over rows i
        Threads.@threads for i in 1:n
            if i != k
                factor = B[i, k] # Factor from column k in row i
                 # Subtract a multiple of the pivot row (row k) from row i
                 @inbounds for col = 1:n
                     B[i, col] -= factor * B[k, col]
                     I_matrix[i, col] -= factor * I_matrix[k, col]
                 end
            end
        end
    end

    return I_matrix # After reduction, I_matrix holds the inverse
end
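
# (Added note, not in the original: the @threads elimination loop above is
# race-free because each thread writes only its own row i, and row k -- which
# every thread reads -- is skipped by the `i != k` guard.)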

# Single-threaded Crout Factorization and Inverse (For serial baseline)
function crout_inverse_original(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n)
    Ainv = zeros(n, n)

    for j in 1:n
        # Compute column j of L
        for i in j:n
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[i,k] * U[k,j]
            end
            L[i,j] = A[i,j] - sum_val

            # Check for singularity during factorization
            if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
                error("Matrix is numerically singular during Crout factorization at step $j")
            end
        end

        # Compute row j of U
        inv_Ljj = 1.0 / L[j,j]
        for i in j+1:n
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[j,k] * U[k,i]
            end
            U[j,i] = (A[j,i] - sum_val) * inv_Ljj
        end
    end

    # Solve for the inverse column by column
    for i in 1:n
        e = zeros(n)
        e[i] = 1.0
        y = L \ e # Solve L * y = e (Forward Substitution)
        Ainv[:,i] = U \ y # Solve U * x = y (Backward Substitution)
    end
    return Ainv
end

# Multi-threaded Crout Factorization and Inverse
function crout_inverse_threaded(A::Matrix{Float64})
    n = size(A, 1)
    L = zeros(n, n)
    U = Matrix{Float64}(I, n, n) # U is initialized as Identity

    # Crout Factorization (mostly sequential due to dependencies)
    # Parallelizing this part is complex and might not be beneficial for small matrices
    @inbounds for j in 1:n # Loop over columns
        # Compute column j of L
        for i in j:n # Loop over rows in column j
            sum_val = zero(Float64) # Corrected typo here (was Float66)
            @inbounds for k in 1:j-1
                sum_val += L[i,k] * U[k,j]
            end
            L[i,j] = A[i,j] - sum_val

            # Check for singularity during factorization
            if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
                error("Matrix is numerically singular during Crout factorization at step $j")
            end
        end

        # Compute row j of U (for elements right of the diagonal)
        inv_Ljj = 1.0 / L[j,j]
        for i in j+1:n # Loop over columns in row j (right of diagonal)
            sum_val = zero(Float64)
            @inbounds for k in 1:j-1
                sum_val += L[j,k] * U[k,i]
            end
            U[j,i] = (A[j,i] - sum_val) * inv_Ljj
        end
    end

    # Solve for the inverse column by column (Parallelize this step)
    # A * Ainv = I => L*U * Ainv = I. Solve L*Y = I for Y, then U*Ainv = Y for Ainv.
    # We can solve for each column of Ainv independently.
    Ainv = zeros(n, n)
    Threads.@threads for i in 1:n # Parallelize over columns of the identity matrix (and thus Ainv)
        e = zeros(n) # Column of the identity matrix
        e[i] = 1.0

        # Solve L * y = e (Forward Substitution)
        # This part is sequential within each thread
        y = zeros(n)
        @inbounds for row in 1:n
            sum_val = zero(Float64)
            @inbounds for col in 1:row-1
                sum_val += L[row, col] * y[col]
            end
            y[row] = (e[row] - sum_val) / L[row, row]
        end

        # Solve U * x = y (Backward Substitution)
        # This part is sequential within each thread
        x = zeros(n) # This will be the i-th column of Ainv
        @inbounds for row in n:-1:1
            sum_val = zero(Float64)
            @inbounds for col in row+1:n
                sum_val += U[row, col] * x[col]
            end
            x[row] = (y[row] - sum_val) / U[row, row]
        end
        Ainv[:,i] .= x # Store the computed column of the inverse
    end
    return Ainv
end

# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
# dgetrf! computes the LU factorization with partial pivoting
# dgetri! computes the inverse from the LU factorization
function lapack_inverse(A::Matrix{Float64})
    n = size(A, 1)
    n != size(A, 2) && error("Matrix must be square")

    # Create a copy since we modify in-place
    A_copy = copy(A)

    # Step 1: LU factorization (getrf!)
    # In Julia, getrf! modifies A_copy in-place and returns (A_copy, ipiv, info)
    result = LinearAlgebra.LAPACK.getrf!(A_copy)
    A_lu = result[1] # This is the modified A_copy
    ipiv = result[2]
    info_getrf = result[3] # Use a specific variable name

    # Check for errors from getrf!
    if info_getrf > 0
        error("Matrix is singular in getrf! at position $info_getrf (from getrf!)")
    elseif info_getrf < 0
        error("Invalid argument in getrf! at position $(-info_getrf) (from getrf!)")
    end

    # Step 2: Matrix inversion (getri!)
    # getri! overwrites A_lu with the inverse and returns it; it checks
    # LAPACK's info internally and throws an error if it is nonzero.
    return LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
end


# === Benchmarking ===

function run_benchmarks()
    small_n = 250
    large_n = 2000
    small_reps = 1000 # Increased repetitions for small matrix
    large_reps = 1 # Fewer repetitions for large matrices due to time

    println("\nStarting Benchmarks (Small: $small_reps repetitions on $small_nΓ—$small_n, Large: $large_reps repetition on $large_nΓ—$large_n)...")

    # --- Small Matrix Benchmarks ---
    println("\nSmall matrix benchmarks ($small_reps repetitions on $small_nΓ—$small_n)...")
    Random.seed!(42) # Use a fixed seed for reproducibility
    A_small = randn(small_n, small_n)
    # Add small diagonal perturbation to help ensure invertibility
    A_small += I * norm(A_small, Inf) * 1e-10

    # Declare variables and initialize outside the try blocks for small matrices
    local t_gauss_serial_small = NaN; local err_gauss_serial_small_min = NaN; local err_gauss_serial_small_max = NaN; local err_gauss_serial_small_avg = NaN;
    local t_gauss_v1_small = NaN; local err_gauss_v1_small_min = NaN; local err_gauss_v1_small_max = NaN; local err_gauss_v1_small_avg = NaN;
    local t_crout_serial_small = NaN; local err_crout_serial_small_min = NaN; local err_crout_serial_small_max = NaN; local err_crout_serial_small_avg = NaN;
    local t_crout_threaded_small = NaN; local err_crout_threaded_small_min = NaN; local err_crout_threaded_small_max = NaN; local err_crout_threaded_small_avg = NaN;
    local t_inv_small = NaN; local err_inv_small_min = NaN; local err_inv_small_max = NaN; local err_inv_small_avg = NaN;
    local t_lapack_small = NaN; local err_lapack_small_min = NaN; local err_lapack_small_max = NaN; local err_lapack_small_avg = NaN;


    # === [1] Gauss Elimination (Standard Serial, Small) ===
    println("\n[1] Gauss Elimination (Standard Serial, Small):")
    errors_gauss_serial_small = Float64[]
    try
        t_gauss_serial_small = @elapsed for i in 1:small_reps
            A_copy = copy(A_small)
            Ainv_rep = gauss_jordan_inverse_serial!(A_copy)
            A2_rep = gauss_jordan_inverse_serial!(Ainv_rep)
            push!(errors_gauss_serial_small, average_error(A_small, A2_rep))
            if i % 200 == 0 # Print progress less often for 1000 reps
               println("  Serial Iteration $i / $small_reps")
            end
        end
        err_gauss_serial_small_min = minimum(errors_gauss_serial_small)
        err_gauss_serial_small_max = maximum(errors_gauss_serial_small)
        err_gauss_serial_small_avg = mean(errors_gauss_serial_small)
        println("  Time: ", round(t_gauss_serial_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_gauss_serial_small_min), " / ", @sprintf("%.3e", err_gauss_serial_small_max), " / ", @sprintf("%.3e", err_gauss_serial_small_avg))
    catch e
        println("  Benchmark [1] failed: ", e)
    end

    # === [2] Gauss Elimination (Threaded V1 - Row Parallel, Small) ===
    println("\n[2] Gauss Elimination (Threaded V1 - Row Parallel, Small):")
    errors_gauss_v1_small = Float64[]
    try
        t_gauss_v1_small = @elapsed for i in 1:small_reps
            A_copy = copy(A_small)
            Ainv_rep = gauss_inverse_threaded_v1!(A_copy)
            A2_rep = gauss_inverse_threaded_v1!(Ainv_rep)
            push!(errors_gauss_v1_small, average_error(A_small, A2_rep))
             if i % 200 == 0 # Print progress less often
               println("  V1 Iteration $i / $small_reps")
            end
        end
        err_gauss_v1_small_min = minimum(errors_gauss_v1_small)
        err_gauss_v1_small_max = maximum(errors_gauss_v1_small)
        err_gauss_v1_small_avg = mean(errors_gauss_v1_small)
        println("  Time: ", round(t_gauss_v1_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_gauss_v1_small_min), " / ", @sprintf("%.3e", err_gauss_v1_small_max), " / ", @sprintf("%.3e", err_gauss_v1_small_avg))
    catch e
        println("  Benchmark [2] failed: ", e)
    end

     # === [3] Crout Inversion (Single Threaded, Small) ===
    println("\n[3] Crout Inversion (Single Threaded, Small):")
    errors_crout_serial_small = Float64[]
    try
        t_crout_serial_small = @elapsed for i in 1:small_reps
            Ainv_rep = crout_inverse_original(A_small)
            A2_rep = crout_inverse_original(Ainv_rep)
            push!(errors_crout_serial_small, average_error(A_small, A2_rep))
             if i % 200 == 0 # Print progress less often
               println("  Crout Serial Iteration $i / $small_reps")
            end
        end
        err_crout_serial_small_min = minimum(errors_crout_serial_small)
        err_crout_serial_small_max = maximum(errors_crout_serial_small)
        err_crout_serial_small_avg = mean(errors_crout_serial_small)
        println("  Time: ", round(t_crout_serial_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_crout_serial_small_min), " / ", @sprintf("%.3e", err_crout_serial_small_max), " / ", @sprintf("%.3e", err_crout_serial_small_avg))
    catch e
        println("  Benchmark [3] failed: ", e)
    end

    # === [4] Crout Inversion (Threaded, Small) ===
    println("\n[4] Crout Inversion (Threaded, Small):")
    errors_crout_threaded_small = Float64[]
    try
        t_crout_threaded_small = @elapsed for i in 1:small_reps
            Ainv_rep = crout_inverse_threaded(A_small)
            A2_rep = crout_inverse_threaded(Ainv_rep)
            push!(errors_crout_threaded_small, average_error(A_small, A2_rep))
            if i % 200 == 0 # Print progress less often
               println("  Crout Threaded Iteration $i / $small_reps")
            end
        end
        err_crout_threaded_small_min = minimum(errors_crout_threaded_small)
        err_crout_threaded_small_max = maximum(errors_crout_threaded_small)
        err_crout_threaded_small_avg = mean(errors_crout_threaded_small)
        println("  Time: ", round(t_crout_threaded_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_crout_threaded_small_min), " / ", @sprintf("%.3e", err_crout_threaded_small_max), " / ", @sprintf("%.3e", err_crout_threaded_small_avg))
    catch e
        println("  Benchmark [4] failed: ", e)
    end

    # === [5] Native Julia (inv) Small ===
    println("\n[5] Native Julia (inv, Small):")
    errors_inv_small = Float64[]
    try
        t_inv_small = @elapsed for i in 1:small_reps
            Ainv_rep = inv(A_small)
            A2_rep = inv(Ainv_rep)
            push!(errors_inv_small, average_error(A_small, A2_rep))
            if i % 200 == 0 # Print progress less often
               println("  inv Iteration $i / $small_reps")
            end
        end
        err_inv_small_min = minimum(errors_inv_small)
        err_inv_small_max = maximum(errors_inv_small)
        err_inv_small_avg = mean(errors_inv_small)
        println("  Time: ", round(t_inv_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_inv_small_min), " / ", @sprintf("%.3e", err_inv_small_max), " / ", @sprintf("%.3e", err_inv_small_avg))
    catch e
        println("  Benchmark [5] failed: ", e)
    end

    # === [6] LAPACK manual Small ===
    println("\n[6] LAPACK (LU + inv, Small):")
    errors_lapack_small = Float64[]
    try
        t_lapack_small = @elapsed for i in 1:small_reps
            Ainv_rep = lapack_inverse(A_small)
            A2_rep = lapack_inverse(Ainv_rep)
            push!(errors_lapack_small, average_error(A_small, A2_rep))
             if i % 200 == 0 # Print progress less often
               println("  LAPACK Iteration $i / $small_reps")
            end
        end
        err_lapack_small_min = minimum(errors_lapack_small)
        err_lapack_small_max = maximum(errors_lapack_small)
        err_lapack_small_avg = mean(errors_lapack_small)
        println("  Time: ", round(t_lapack_small, digits=3), " seconds")
        println("  Error (Min/Max/Avg): ", @sprintf("%.3e", err_lapack_small_min), " / ", @sprintf("%.3e", err_lapack_small_max), " / ", @sprintf("%.3e", err_lapack_small_avg))
    catch e
        println("  Benchmark [6] failed: ", e)
    end


    # --- Large Matrix Benchmarks ---
    println("\nLarge matrix benchmarks ($large_reps repetition on $large_nΓ—$large_n)...")
    Random.seed!(123) # Different seed for large matrix
    A_large = randn(large_n, large_n)
    # Add small diagonal perturbation to help ensure invertibility
    A_large += I * norm(A_large, Inf) * 1e-10

     # Declare variables and initialize outside the try blocks for large matrices
    local t_gauss_serial_large = NaN; local err_gauss_serial_large = NaN;
    local t_gauss_v1_large = NaN; local err_gauss_v1_large = NaN;
    local t_crout_serial_large = NaN; local err_crout_serial_large = NaN;
    local t_crout_threaded_large = NaN; local err_crout_threaded_large = NaN;
    local t_inv_large = NaN; local err_inv_large = NaN;
    local t_lapack_large = NaN; local err_lapack_large = NaN;


    # === [7] Gauss Elimination (Standard Serial, Large) ===
    println("\n[7] Gauss Elimination (Standard Serial, Large):")
    try
        t_gauss_serial_large = @elapsed begin
            Ainv_large = gauss_jordan_inverse_serial!(copy(A_large))
            A2_large = gauss_jordan_inverse_serial!(Ainv_large)
            err_gauss_serial_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_gauss_serial_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_gauss_serial_large))
    catch e
        println("  Benchmark [7] failed: ", e)
    end

    # === [8] Gauss Elimination (Threaded V1 - Row Parallel, Large) ===
    println("\n[8] Gauss Elimination (Threaded V1 - Row Parallel, Large):")
    try
        t_gauss_v1_large = @elapsed begin
            Ainv_large = gauss_inverse_threaded_v1!(copy(A_large))
            A2_large = gauss_inverse_threaded_v1!(Ainv_large)
            err_gauss_v1_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_gauss_v1_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_gauss_v1_large))
    catch e
        println("  Benchmark [8] failed: ", e)
    end

    # === [9] Crout Inversion (Single Threaded, Large) ===
    println("\n[9] Crout Inversion (Single Threaded, Large):")
    try
        t_crout_serial_large = @elapsed begin
            Ainv_large = crout_inverse_original(A_large)
            A2_large = crout_inverse_original(Ainv_large)
            err_crout_serial_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_crout_serial_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_crout_serial_large))
    catch e
        println("  Benchmark [9] failed: ", e)
    end

    # === [10] Crout Inversion (Threaded, Large) ===
    println("\n[10] Crout Inversion (Threaded, Large):")
    try
        t_crout_threaded_large = @elapsed begin
            Ainv_large = crout_inverse_threaded(A_large)
            A2_large = crout_inverse_threaded(Ainv_large)
            err_crout_threaded_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_crout_threaded_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_crout_threaded_large))
    catch e
        println("  Benchmark [10] failed: ", e)
    end

    # === [11] Native Julia (inv) Large ===
    println("\n[11] Native Julia (inv, Large):")
    try
        t_inv_large = @elapsed begin
            Ainv_large = inv(A_large)
            A2_large = inv(Ainv_large)
            err_inv_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_inv_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_inv_large))
    catch e
        println("  Benchmark [11] failed: ", e)
    end

    # === [12] LAPACK manual Large ===
    println("\n[12] LAPACK (LU + inv, Large):")
    try
        t_lapack_large = @elapsed begin
            Ainv_large = lapack_inverse(A_large)
            A2_large = lapack_inverse(Ainv_large)
            err_lapack_large = average_error(A_large, A2_large)
        end
        println("  Time: ", round(t_lapack_large, digits=3), " seconds")
        println("  Average error: ", @sprintf("%.3e", err_lapack_large))
    catch e
        println("  Benchmark [12] failed: ", e)
    end


    println("\nBenchmark Summary (Time in seconds, Error as Max Error for Small, Avg Error for Large):")
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "Method" "Time (Small)" "Error (Small)" "Time (Large)" "Error (Large)"
    println("-"^110)
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[1] Gauss (Standard Serial)" (isnan(t_gauss_serial_small) ? "Failed" : round(t_gauss_serial_small, digits=3)) (isnan(err_gauss_serial_small_max) ? "N/A" : @sprintf("%.3e", err_gauss_serial_small_max)) (isnan(t_gauss_serial_large) ? "Failed" : round(t_gauss_serial_large, digits=3)) (isnan(err_gauss_serial_large) ? "N/A" : @sprintf("%.3e", err_gauss_serial_large))
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[2] Gauss (Threaded V1 - Row Parallel)" (isnan(t_gauss_v1_small) ? "Failed" : round(t_gauss_v1_small, digits=3)) (isnan(err_gauss_v1_small_max) ? "N/A" : @sprintf("%.3e", err_gauss_v1_small_max)) (isnan(t_gauss_v1_large) ? "Failed" : round(t_gauss_v1_large, digits=3)) (isnan(err_gauss_v1_large) ? "N/A" : @sprintf("%.3e", err_gauss_v1_large))
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[3] Crout (Single Threaded)" (isnan(t_crout_serial_small) ? "Failed" : round(t_crout_serial_small, digits=3)) (isnan(err_crout_serial_small_max) ? "N/A" : @sprintf("%.3e", err_crout_serial_small_max)) (isnan(t_crout_serial_large) ? "Failed" : round(t_crout_serial_large, digits=3)) (isnan(err_crout_serial_large) ? "N/A" : @sprintf("%.3e", err_crout_serial_large))
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[4] Crout (Threaded)" (isnan(t_crout_threaded_small) ? "Failed" : round(t_crout_threaded_small, digits=3)) (isnan(err_crout_threaded_small_max) ? "N/A" : @sprintf("%.3e", err_crout_threaded_small_max)) (isnan(t_crout_threaded_large) ? "Failed" : round(t_crout_threaded_large, digits=3)) (isnan(err_crout_threaded_large) ? "N/A" : @sprintf("%.3e", err_crout_threaded_large))
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[5] Native Julia (inv)" (isnan(t_inv_small) ? "Failed" : round(t_inv_small, digits=3)) (isnan(err_inv_small_max) ? "N/A" : @sprintf("%.3e", err_inv_small_max)) (isnan(t_inv_large) ? "Failed" : round(t_inv_large, digits=3)) (isnan(err_inv_large) ? "N/A" : @sprintf("%.3e", err_inv_large))
    @printf "  %-40s | %-15s | %-15s | %-15s | %-15s\n" "[6] LAPACK (LU+inv)" (isnan(t_lapack_small) ? "Failed" : round(t_lapack_small, digits=3)) (isnan(err_lapack_small_max) ? "N/A" : @sprintf("%.3e", err_lapack_small_max)) (isnan(t_lapack_large) ? "Failed" : round(t_lapack_large, digits=3)) (isnan(err_lapack_large) ? "N/A" : @sprintf("%.3e", err_lapack_large))

end

# To run with multiple threads, start Julia with `julia -t auto` or `julia -t N`
# where N is the number of threads.
run_benchmarks()

Updated with AI help; now working.

Benchmark Summary (Time in seconds, Error as Max Error for Small, Avg Error for Large):
  Method                                   | Time (Small)    | Error (Small)   | Time (Large)    | Error (Large)
--------------------------------------------------------------------------------------------------------------
  [1] Gauss (Standard Serial)              | 188.535         | 7.272e-15       | 285.61          | 3.633e-14
  [2] Gauss (Threaded V1 - Row Parallel)   | 109.895         | 7.272e-15       | 25.076          | 3.633e-14
  [3] Crout (Single Threaded)              | 38.474          | 5.135e-11       | 25.035          | 1.806e-09
  [4] Crout (Threaded)                     | 9.964           | 4.081e-11       | 7.615           | 1.839e-09
  [5] Native Julia (inv)                   | 3.545           | 6.973e-14       | 0.209           | 1.095e-12
  [6] LAPACK (LU+inv)                      | 3.55            | 6.973e-14       | 0.2             | 1.095e-12

Compare to Fortran:

C:\Temp>test_fpu3_Qmkl_Qpar_O3_Qax
  Benchmark running, hopefully as only ACTIVE task
Test1: Gauss  - 1000 ( 250x 250) inverts in   5.848 seconds  Err=0.000000000000
Test2: Crout  - 1000 ( 250x 250) inverts in   6.178 seconds  Err=0.000000000000
Test3: Crout  -    2 (2000x2000) inverts in   5.852 seconds  Err=0.000000000000
Test4: Lapack -    2 (2000x2000) inverts in   0.302 seconds  Err=0.000000000002

The LAPACK route and native Julia inv(A) beat the Fortran inverse, 0.2 s vs 0.3 s, both with MKL.
For the plain non-BLAS code, i.e. hand-written matrix operations like Crout, Fortran is faster.

Would be interesting to see some Julia speed gurus take a stab at optimizing your code. Fortran performance should be achievable, but the code that hits the fast path may look slightly different in the two languages.
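
For instance, one change along those lines (a sketch, not a tuned implementation) would be to rewrite the hot inner dot products of the Crout factorization so the compiler can vectorize them, which is the kind of loop a Fortran compiler auto-vectorizes:

# Inner accumulation from crout_inverse, annotated to encourage SIMD
# (crout_dot is a hypothetical helper name):
function crout_dot(L::Matrix{Float64}, U::Matrix{Float64}, i::Int, j::Int)
    s = 0.0
    @inbounds @simd for k in 1:j-1
        s += L[i, k] * U[k, j]
    end
    return s
end

Note that L[i, k] walks a row of a column-major array, so the payoff depends on access patterns; restructuring the loops for column-wise access would likely help more.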