- hmm with @btime i dont know how to print progress repetitions. like 100/1000 and so on.
C:\Temp>julia test_fpu_1core_mkl_timings.jl
BLAS configuration:
Library: lbt
Threads: 12
BLAS backend libraries:
LBTConfig([ILP64] mkl_rt.2.dll, [LP64] mkl_rt.2.dll)
Small matrix benchmark (250Γ250), 1000 reps
[1] Gauss Elimination:
43.509 s (259354000 allocations: 246.67 GiB)
Average error: 2.170767345625942e-14
[2] Crout Inversion:
85.268 s (750027000 allocations: 267.01 GiB)
Average error: 1.3786365392807404e-10
Large matrix benchmark (2000Γ2000), 1 rep
[3] Crout Large:
45.665 s (66345051 allocations: 122.14 GiB)
Average error: 1.8646103528466233e-9
[4] Native Julia (inv) Large:
191.761 ms (23 allocations: 93.54 MiB)
Average error: 1.0998328686413708e-12
[5] LAPACK (LU + inv) Large:
191.918 ms (23 allocations: 93.54 MiB)
Average error: 1.0998328686413708e-12
- yes apparrently the back end will go to mkl.
println("BLAS backend libraries:")
println(BLAS.get_config())
- since the repetition in small matrix, i am thinking maybe we can get all cpu power threading.
source :
using MKL # Attempt to load MKL - this should be done before LinearAlgebra in a fresh session
using LinearAlgebra
using Random
using LinearAlgebra.BLAS
using Printf # For formatted output
using Statistics # For mean function
using BenchmarkTools # For accurate timing and statistics
# Ensure Julia is started with multiple threads (e.g., julia -t auto)
if Threads.nthreads() == 1
println("WARNING: Julia is running with only 1 thread. For parallel benchmarks, start Julia with `julia -t auto` or `julia -t N` where N > 1.")
println("Current Julia Threads: ", Threads.nthreads())
else
println("Julia Threads: ", Threads.nthreads()) # Show number of Julia threads
end
println("BLAS Threads: ", LinearAlgebra.BLAS.get_num_threads()) # Show number of BLAS threads (set by default or environment)
function average_error(A, A2)
n = size(A, 1)
# Handle potential size mismatch or non-numeric types if errors occurred
if size(A) != size(A2) || !isa(A2, AbstractMatrix{Float64})
return NaN
end
return sum(abs.(A .- A2)) / (n * n)
end
# === Implementations ===
# Standard Serial Gauss-Jordan Elimination based Inverse (Reference for accuracy)
function gauss_jordan_inverse_serial!(A::Matrix{Float64})
n = size(A, 1)
B = copy(A) # Work on a copy of A
I_matrix = Matrix{Float64}(I, n, n) # Start with an identity matrix
@inbounds for k in 1:n
# Partial Pivoting
pivot_row = k
for i in k+1:n
if abs(B[i, k]) > abs(B[pivot_row, k])
pivot_row = i
end
end
if pivot_row != k
# Swap rows in both B and I_matrix
B[k, :], B[pivot_row, :] = B[pivot_row, :], B[k, :]
I_matrix[k, :], I_matrix[pivot_row, :] = I_matrix[pivot_row, :], I_matrix[k, :]
end
pivot_element = B[k, k]
if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
error("Matrix is numerically singular at step $k (Gauss-Jordan Serial)")
end
# Scale the pivot row to make the pivot element 1
scale = 1.0 / pivot_element
B[k, :] .*= scale
I_matrix[k, :] .*= scale
# Perform elimination for other rows
for i in 1:n
if i != k
factor = B[i, k]
# Vectorized subtraction using broadcast
B[i, :] .-= factor .* B[k, :]
I_matrix[i, :] .-= factor .* I_matrix[k, :]
end
end
end
# After reduction, I_matrix holds the inverse
return I_matrix
end
# Threaded Gauss Variation 1: Parallelize Elimination over Rows i (Validated Accurate)
function gauss_inverse_threaded_v1!(A::Matrix{Float64})
n = size(A, 1)
B = copy(A) # Augment with Identity
I_matrix = Matrix{Float64}(I, n, n) # Identity matrix
@inbounds for k in 1:n
# Partial Pivoting (Serial)
pivot_row = k
for i in k+1:n
if abs(B[i, k]) > abs(B[pivot_row, k])
pivot_row = i
end
end
if pivot_row != k
# Swap rows in both B and I_matrix - Parallelize row swaps across all columns
Threads.@threads for col = 1:n
B[k, col], B[pivot_row, col] = B[pivot_row, col], B[k, col]
I_matrix[k, col], I_matrix[pivot_row, col] = I_matrix[pivot_row, col], I_matrix[k, col]
end
end
pivot_element = B[k, k]
if abs(pivot_element) < eps(Float64) * n * sqrt(n) # More robust singularity check
error("Matrix is numerically singular at step $k (Gauss Threaded V1)")
end
# Scale the pivot row - Parallelize scaling across columns
scale = 1.0 / pivot_element
Threads.@threads for col = 1:n
B[k, col] *= scale
I_matrix[k, col] *= scale
end
# Perform elimination for other rows - Parallelize over rows i
Threads.@threads for i in 1:n
if i != k
factor = B[i, k] # Factor from column k in row i
# Subtract a multiple of the pivot row (row k) from row i
@inbounds for col = 1:n
B[i, col] -= factor * B[k, col]
I_matrix[i, col] -= factor * I_matrix[k, col]
end
end
end
end
return I_matrix # After reduction, I_matrix holds the inverse
end
# Single-threaded Crout Factorization and Inverse (For serial baseline)
function crout_inverse_original(A::Matrix{Float64})
n = size(A, 1)
L = zeros(n, n)
U = Matrix{Float64}(I, n, n)
Ainv = zeros(n, n)
for j in 1:n
# Compute column j of L
for i in j:n
sum_val = zero(Float64)
@inbounds for k in 1:j-1
sum_val += L[i,k] * U[k,j]
end
L[i,j] = A[i,j] - sum_val
# Check for singularity during factorization
if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
error("Matrix is numerically singular during Crout factorization at step $j")
end
end
# Compute row j of U
inv_Ljj = 1.0 / L[j,j]
for i in j+1:n
sum_val = zero(Float64)
@inbounds for k in 1:j-1
sum_val += L[j,k] * U[k,i]
end
U[j,i] = (A[j,i] - sum_val) * inv_Ljj
end
end
# Solve for the inverse column by column
for i in 1:n
e = zeros(n)
e[i] = 1.0
y = L \ e # Solve L * y = e (Forward Substitution)
Ainv[:,i] = U \ y # Solve U * x = y (Backward Substitution)
end
return Ainv
end
# Multi-threaded Crout Factorization and Inverse
function crout_inverse_threaded(A::Matrix{Float64})
n = size(A, 1)
L = zeros(n, n)
U = Matrix{Float64}(I, n, n) # U is initialized as Identity
# Crout Factorization (mostly sequential due to dependencies)
# Parallelizing this part is complex and might not be beneficial for small matrices
@inbounds for j in 1:n # Loop over columns
# Compute column j of L
for i in j:n # Loop over rows in column j
sum_val = zero(Float64) # Corrected typo here (was Float66)
@inbounds for k in 1:j-1
sum_val += L[i,k] * U[k,j]
end
L[i,j] = A[i,j] - sum_val
# Check for singularity during factorization
if i == j && abs(L[j,j]) < eps(Float64) * n * sqrt(n)
error("Matrix is numerically singular during Crout factorization at step $j")
end
end
# Compute row j of U (for elements right of the diagonal)
inv_Ljj = 1.0 / L[j,j]
for i in j+1:n # Loop over columns in row j (right of diagonal)
sum_val = zero(Float64)
@inbounds for k in 1:j-1
sum_val += L[j,k] * U[k,i]
end
U[j,i] = (A[j,i] - sum_val) * inv_Ljj
end
end
# Solve for the inverse column by column (Parallelize this step)
# A * Ainv = I => L*U * Ainv = I. Solve L*Y = I for Y, then U*Ainv = Y for Ainv.
# We can solve for each column of Ainv independently.
Ainv = zeros(n, n)
Threads.@threads for i in 1:n # Parallelize over columns of the identity matrix (and thus Ainv)
e = zeros(n) # Column of the identity matrix
e[i] = 1.0
# Solve L * y = e (Forward Substitution)
# This part is sequential within each thread
y = zeros(n)
@inbounds for row in 1:n
sum_val = zero(Float64)
@inbounds for col in 1:row-1
sum_val += L[row, col] * y[col]
end
y[row] = (e[row] - sum_val) / L[row, row]
end
# Solve U * x = y (Backward Substitution)
# This part is sequential within each thread
x = zeros(n) # This will be the i-th column of Ainv
@inbounds for row in n:-1:1
sum_val = zero(Float64)
@inbounds for col in row+1:n
sum_val += U[row, col] * x[col]
end
x[row] = (y[row] - sum_val) / U[row, row]
end
Ainv[:,i] .= x # Store the computed column of the inverse
end
return Ainv
end
# Function to compute the inverse using LAPACK's dgetrf! and dgetri!
# dgetrf! computes the LU factorization with partial pivoting
# dgetri! computes the inverse from the LU factorization
function lapack_inverse(A::Matrix{Float64})
n = size(A, 1)
n != size(A, 2) && error("Matrix must be square")
# Create a copy since we modify in-place
A_copy = copy(A)
# Step 1: LU factorization (getrf!)
# In Julia, getrf! modifies A_copy in-place and returns (A_copy, ipiv, info)
result = LinearAlgebra.LAPACK.getrf!(A_copy)
A_lu = result[1] # This is the modified A_copy
ipiv = result[2]
info_getrf = result[3] # Use a specific variable name
# Check for errors from getrf!
if info_getrf > 0
error("Matrix is singular in getrf! at position $info_getrf (from getrf!)")
elseif info_getrf < 0
error("Invalid argument in getrf! at position $(-info_getrf) (from getrf!)")
end
# Step 2: Matrix inversion (getri!)
# In Julia, getri! modifies A_lu in-place with the inverse and returns (A_lu, info)
A_inv = LinearAlgebra.LAPACK.getri!(A_lu, ipiv)
end
# === Benchmarking ===
function run_benchmarks()
small_n = 250
large_n = 2000
small_reps = 1000 # Increased repetitions for small matrix
large_reps = 1 # Fewer repetitions for large matrices due to time
println("\nStarting Benchmarks (Small: $small_reps repetitions on $small_nΓ$small_n, Large: $large_reps repetition on $large_nΓ$large_n)...")
# --- Small Matrix Benchmarks ---
println("\nSmall matrix benchmarks ($small_reps repetitions on $small_nΓ$small_n)...")
Random.seed!(42) # Use a fixed seed for reproducibility
A_small = randn(small_n, small_n)
# Add small diagonal perturbation to help ensure invertibility
A_small += I * norm(A_small, Inf) * 1e-10
# Declare variables and initialize outside the try blocks for small matrices
local t_gauss_serial_small = NaN; local err_gauss_serial_small_min = NaN; local err_gauss_serial_small_max = NaN; local err_gauss_serial_small_avg = NaN;
local t_gauss_v1_small = NaN; local err_gauss_v1_small_min = NaN; local err_gauss_v1_small_max = NaN; local err_gauss_v1_small_avg = NaN;
local t_crout_serial_small = NaN; local err_crout_serial_small_min = NaN; local err_crout_serial_small_max = NaN; local err_crout_serial_small_avg = NaN;
local t_crout_threaded_small = NaN; local err_crout_threaded_small_min = NaN; local err_crout_threaded_small_max = NaN; local err_crout_threaded_small_avg = NaN;
local t_inv_small = NaN; local err_inv_small_min = NaN; local err_inv_small_max = NaN; local err_inv_small_avg = NaN;
local t_lapack_small = NaN; local err_lapack_small_min = NaN; local err_lapack_small_max = NaN; local err_lapack_small_avg = NaN;
# === [1] Gauss Elimination (Standard Serial, Small) ===
println("\n[1] Gauss Elimination (Standard Serial, Small):")
errors_gauss_serial_small = Float64[]
try
t_gauss_serial_small = @elapsed for i in 1:small_reps
A_copy = copy(A_small)
Ainv_rep = gauss_jordan_inverse_serial!(A_copy)
A2_rep = gauss_jordan_inverse_serial!(Ainv_rep)
push!(errors_gauss_serial_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often for 1000 reps
println(" Serial Iteration $i / $small_reps")
end
end
err_gauss_serial_small_min = minimum(errors_gauss_serial_small)
err_gauss_serial_small_max = maximum(errors_gauss_serial_small)
err_gauss_serial_small_avg = mean(errors_gauss_serial_small)
println(" Time: ", round(t_gauss_serial_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_gauss_serial_small_min), " / ", @sprintf("%.3e", err_gauss_serial_small_max), " / ", @sprintf("%.3e", err_gauss_serial_small_avg))
catch e
println(" Benchmark [1] failed: ", e)
end
# === [2] Gauss Elimination (Threaded V1 - Row Parallel, Small) ===
println("\n[2] Gauss Elimination (Threaded V1 - Row Parallel, Small):")
errors_gauss_v1_small = Float64[]
try
t_gauss_v1_small = @elapsed for i in 1:small_reps
A_copy = copy(A_small)
Ainv_rep = gauss_inverse_threaded_v1!(A_copy)
A2_rep = gauss_inverse_threaded_v1!(Ainv_rep)
push!(errors_gauss_v1_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often
println(" V1 Iteration $i / $small_reps")
end
end
err_gauss_v1_small_min = minimum(errors_gauss_v1_small)
err_gauss_v1_small_max = maximum(errors_gauss_v1_small)
err_gauss_v1_small_avg = mean(errors_gauss_v1_small)
println(" Time: ", round(t_gauss_v1_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_gauss_v1_small_min), " / ", @sprintf("%.3e", err_gauss_v1_small_max), " / ", @sprintf("%.3e", err_gauss_v1_small_avg))
catch e
println(" Benchmark [2] failed: ", e)
end
# === [3] Crout Inversion (Single Threaded, Small) ===
println("\n[3] Crout Inversion (Single Threaded, Small):")
errors_crout_serial_small = Float64[]
try
t_crout_serial_small = @elapsed for i in 1:small_reps
Ainv_rep = crout_inverse_original(A_small)
A2_rep = crout_inverse_original(Ainv_rep)
push!(errors_crout_serial_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often
println(" Crout Serial Iteration $i / $small_reps")
end
end
err_crout_serial_small_min = minimum(errors_crout_serial_small)
err_crout_serial_small_max = maximum(errors_crout_serial_small)
err_crout_serial_small_avg = mean(errors_crout_serial_small)
println(" Time: ", round(t_crout_serial_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_crout_serial_small_min), " / ", @sprintf("%.3e", err_crout_serial_small_max), " / ", @sprintf("%.3e", err_crout_serial_small_avg))
catch e
println(" Benchmark [3] failed: ", e)
end
# === [4] Crout Inversion (Threaded, Small) ===
println("\n[4] Crout Inversion (Threaded, Small):")
errors_crout_threaded_small = Float64[]
try
t_crout_threaded_small = @elapsed for i in 1:small_reps
Ainv_rep = crout_inverse_threaded(A_small)
A2_rep = crout_inverse_threaded(Ainv_rep)
push!(errors_crout_threaded_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often
println(" Crout Threaded Iteration $i / $small_reps")
end
end
err_crout_threaded_small_min = minimum(errors_crout_threaded_small)
err_crout_threaded_small_max = maximum(errors_crout_threaded_small)
err_crout_threaded_small_avg = mean(errors_crout_threaded_small)
println(" Time: ", round(t_crout_threaded_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_crout_threaded_small_min), " / ", @sprintf("%.3e", err_crout_threaded_small_max), " / ", @sprintf("%.3e", err_crout_threaded_small_avg))
catch e
println(" Benchmark [4] failed: ", e)
end
# === [5] Native Julia (inv) Small ===
println("\n[5] Native Julia (inv, Small):")
errors_inv_small = Float64[]
try
t_inv_small = @elapsed for i in 1:small_reps
Ainv_rep = inv(A_small)
A2_rep = inv(Ainv_rep)
push!(errors_inv_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often
println(" inv Iteration $i / $small_reps")
end
end
err_inv_small_min = minimum(errors_inv_small)
err_inv_small_max = maximum(errors_inv_small)
err_inv_small_avg = mean(errors_inv_small)
println(" Time: ", round(t_inv_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_inv_small_min), " / ", @sprintf("%.3e", err_inv_small_max), " / ", @sprintf("%.3e", err_inv_small_avg))
catch e
println(" Benchmark [5] failed: ", e)
end
# === [6] LAPACK manual Small ===
println("\n[6] LAPACK (LU + inv, Small):")
errors_lapack_small = Float64[]
try
t_lapack_small = @elapsed for i in 1:small_reps
Ainv_rep = lapack_inverse(A_small)
A2_rep = lapack_inverse(Ainv_rep)
push!(errors_lapack_small, average_error(A_small, A2_rep))
if i % 200 == 0 # Print progress less often
println(" LAPACK Iteration $i / $small_reps")
end
end
err_lapack_small_min = minimum(errors_lapack_small)
err_lapack_small_max = maximum(errors_lapack_small)
err_lapack_small_avg = mean(errors_lapack_small)
println(" Time: ", round(t_lapack_small, digits=3), " seconds")
println(" Error (Min/Max/Avg): ", @sprintf("%.3e", err_lapack_small_min), " / ", @sprintf("%.3e", err_lapack_small_max), " / ", @sprintf("%.3e", err_lapack_small_avg))
catch e
println(" Benchmark [6] failed: ", e)
end
# --- Large Matrix Benchmarks ---
println("\nLarge matrix benchmarks ($large_reps repetition on $large_nΓ$large_n)...")
Random.seed!(123) # Different seed for large matrix
A_large = randn(large_n, large_n)
# Add small diagonal perturbation to help ensure invertibility
A_large += I * norm(A_large, Inf) * 1e-10
# Declare variables and initialize outside the try blocks for large matrices
local t_gauss_serial_large = NaN; local err_gauss_serial_large = NaN;
local t_gauss_v1_large = NaN; local err_gauss_v1_large = NaN;
local t_crout_serial_large = NaN; local err_crout_serial_large = NaN;
local t_crout_threaded_large = NaN; local err_crout_threaded_large = NaN;
local t_inv_large = NaN; local err_inv_large = NaN;
local t_lapack_large = NaN; local err_lapack_large = NaN;
# === [7] Gauss Elimination (Standard Serial, Large) ===
println("\n[7] Gauss Elimination (Standard Serial, Large):")
try
t_gauss_serial_large = @elapsed begin
Ainv_large = gauss_jordan_inverse_serial!(copy(A_large))
A2_large = gauss_jordan_inverse_serial!(Ainv_large)
err_gauss_serial_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_gauss_serial_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_gauss_serial_large))
catch e
println(" Benchmark [7] failed: ", e)
end
# === [8] Gauss Elimination (Threaded V1 - Row Parallel, Large) ===
println("\n[8] Gauss Elimination (Threaded V1 - Row Parallel, Large):")
try
t_gauss_v1_large = @elapsed begin
Ainv_large = gauss_inverse_threaded_v1!(copy(A_large))
A2_large = gauss_inverse_threaded_v1!(Ainv_large)
err_gauss_v1_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_gauss_v1_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_gauss_v1_large))
catch e
println(" Benchmark [8] failed: ", e)
end
# === [9] Crout Inversion (Single Threaded, Large) ===
println("\n[9] Crout Inversion (Single Threaded, Large):")
try
t_crout_serial_large = @elapsed begin
Ainv_large = crout_inverse_original(A_large)
A2_large = crout_inverse_original(Ainv_large)
err_crout_serial_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_crout_serial_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_crout_serial_large))
catch e
println(" Benchmark [9] failed: ", e)
end
# === [10] Crout Inversion (Threaded, Large) ===
println("\n[10] Crout Inversion (Threaded, Large):")
try
t_crout_threaded_large = @elapsed begin
Ainv_large = crout_inverse_threaded(A_large)
A2_large = crout_inverse_threaded(Ainv_large)
err_crout_threaded_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_crout_threaded_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_crout_threaded_large))
catch e
println(" Benchmark [10] failed: ", e)
end
# === [11] Native Julia (inv) Large ===
println("\n[11] Native Julia (inv, Large):")
try
t_inv_large = @elapsed begin
Ainv_large = inv(A_large)
A2_large = inv(Ainv_large)
err_inv_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_inv_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_inv_large))
catch e
println(" Benchmark [11] failed: ", e)
end
# === [12] LAPACK manual Large ===
println("\n[12] LAPACK (LU + inv, Large):")
try
t_lapack_large = @elapsed begin
Ainv_large = lapack_inverse(A_large)
A2_large = lapack_inverse(Ainv_large)
err_lapack_large = average_error(A_large, A2_large)
end
println(" Time: ", round(t_lapack_large, digits=3), " seconds")
println(" Average error: ", @sprintf("%.3e", err_lapack_large))
catch e
println(" Benchmark [12] failed: ", e)
end
println("\nBenchmark Summary (Time in seconds, Error as Max Error for Small, Avg Error for Large):")
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "Method" "Time (Small)" "Error (Small)" "Time (Large)" "Error (Large)"
println("-"^110)
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[1] Gauss (Standard Serial)" (isnan(t_gauss_serial_small) ? "Failed" : round(t_gauss_serial_small, digits=3)) (isnan(err_gauss_serial_small_max) ? "N/A" : @sprintf("%.3e", err_gauss_serial_small_max)) (isnan(t_gauss_serial_large) ? "Failed" : round(t_gauss_serial_large, digits=3)) (isnan(err_gauss_serial_large) ? "N/A" : @sprintf("%.3e", err_gauss_serial_large))
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[2] Gauss (Threaded V1 - Row Parallel)" (isnan(t_gauss_v1_small) ? "Failed" : round(t_gauss_v1_small, digits=3)) (isnan(err_gauss_v1_small_max) ? "N/A" : @sprintf("%.3e", err_gauss_v1_small_max)) (isnan(t_gauss_v1_large) ? "Failed" : round(t_gauss_v1_large, digits=3)) (isnan(err_gauss_v1_large) ? "N/A" : @sprintf("%.3e", err_gauss_v1_large))
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[3] Crout (Single Threaded)" (isnan(t_crout_serial_small) ? "Failed" : round(t_crout_serial_small, digits=3)) (isnan(err_crout_serial_small_max) ? "N/A" : @sprintf("%.3e", err_crout_serial_small_max)) (isnan(t_crout_serial_large) ? "Failed" : round(t_crout_serial_large, digits=3)) (isnan(err_crout_serial_large) ? "N/A" : @sprintf("%.3e", err_crout_serial_large))
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[4] Crout (Threaded)" (isnan(t_crout_threaded_small) ? "Failed" : round(t_crout_threaded_small, digits=3)) (isnan(err_crout_threaded_small_max) ? "N/A" : @sprintf("%.3e", err_crout_threaded_small_max)) (isnan(t_crout_threaded_large) ? "Failed" : round(t_crout_threaded_large, digits=3)) (isnan(err_crout_threaded_large) ? "N/A" : @sprintf("%.3e", err_crout_threaded_large))
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[5] Native Julia (inv)" (isnan(t_inv_small) ? "Failed" : round(t_inv_small, digits=3)) (isnan(err_inv_small_max) ? "N/A" : @sprintf("%.3e", err_inv_small_max)) (isnan(t_inv_large) ? "Failed" : round(t_inv_large, digits=3)) (isnan(err_inv_large) ? "N/A" : @sprintf("%.3e", err_inv_large))
@printf " %-40s | %-15s | %-15s | %-15s | %-15s\n" "[6] LAPACK (LU+inv)" (isnan(t_lapack_small) ? "Failed" : round(t_lapack_small, digits=3)) (isnan(err_lapack_small_max) ? "N/A" : @sprintf("%.3e", err_lapack_small_max)) (isnan(t_lapack_large) ? "Failed" : round(t_lapack_large, digits=3)) (isnan(err_lapack_large) ? "N/A" : @sprintf("%.3e", err_lapack_large))
end
# To run with multiple threads, start Julia with `julia -t auto` or `julia -t N`
# where N is the number of threads.
run_benchmarks()
Updated from AI. now working.
Benchmark Summary (Time in seconds, Error as Max Error for Small, Avg Error for Large):
Method | Time (Small) | Error (Small) | Time (Large) | Error (Large)
--------------------------------------------------------------------------------------------------------------
[1] Gauss (Standard Serial) | 188.535 | 7.272e-15 | 285.61 | 3.633e-14
[2] Gauss (Threaded V1 - Row Parallel) | 109.895 | 7.272e-15 | 25.076 | 3.633e-14
[3] Crout (Single Threaded) | 38.474 | 5.135e-11 | 25.035 | 1.806e-09
[4] Crout (Threaded) | 9.964 | 4.081e-11 | 7.615 | 1.839e-09
[5] Native Julia (inv) | 3.545 | 6.973e-14 | 0.209 | 1.095e-12
[6] LAPACK (LU+inv) | 3.55 | 6.973e-14 | 0.2 | 1.095e-12
compare to fortran
C:\Temp>test_fpu3_Qmkl_Qpar_O3_Qax
Benchmark running, hopefully as only ACTIVE task
Test1: Gauss - 1000 ( 250x 250) inverts in 5.848 seconds Err=0.000000000000
Test2: Crout - 1000 ( 250x 250) inverts in 6.178 seconds Err=0.000000000000
Test3: Crout - 2 (2000x2000) inverts in 5.852 seconds Err=0.000000000000
Test4: Lapack - 2 (2000x2000) inverts in 0.302 seconds Err=0.000000000002
The lapack and inv(A) native julia beat fortran inverse 0.2 vs 0.3 with MKL
For regular non blas, like using matrix operation non blas, like crout, fortran faster.