Hi. I am writing a CFD solver using the Finite Element Method, and I am running into performance problems when assembling the sparse matrices. I build each matrix in CSC format with the function below. It works correctly, but it is too slow for the performance I am aiming for.
I tried multithreading: I create separate rows, cols and values arrays for each thread, concatenate the results from all threads, and then construct the CSC matrix from the concatenated arrays.
This gives some improvement, but not as much as I expected (a speedup of only about 3x on 8 cores).
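In a nutshell, the idea is: every thread pushes (row, col, value) triples for its own elements into per-thread buffers, everything is concatenated at the end, and sparse() (which sums duplicate entries) does the actual assembly. Here is a stripped-down sketch of that pattern for a single square matrix only (variable names as in the full code below; n is just shorthand for the matrix dimension):

# Stripped-down sketch of the approach, for one square matrix only.
using SparseArrays

n    = ndim * total_nodes_domain
rows = [Int[] for _ in 1:Threads.nthreads()]   # one COO buffer per thread
cols = [Int[] for _ in 1:Threads.nthreads()]
vals = [Float64[] for _ in 1:Threads.nthreads()]

Threads.@threads for element in element_data_array
    tid  = Threads.threadid()
    DOFs = element.DOFs_assembly
    for j in 1:ndim*element.NEN, i in 1:ndim*element.NEN
        push!(rows[tid], DOFs[i])
        push!(cols[tid], DOFs[j])
        push!(vals[tid], element.m[i, j])
    end
end

# sparse() sums duplicate (row, col) entries, which performs the assembly.
M = sparse(vcat(rows...), vcat(cols...), vcat(vals...), n, n)

The full function: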
using SparseArrays   # for spzeros / sparse

function assembling_matrices(ndim, total_nodes_domain, element_data_array, fixed_u_nodes, fixed_v_nodes, fixed_w_nodes, fixed_p_nodes)
    M  = spzeros(ndim * total_nodes_domain, ndim * total_nodes_domain)
    K1 = spzeros(ndim * total_nodes_domain, ndim * total_nodes_domain)
    K2 = spzeros(ndim * total_nodes_domain, ndim * total_nodes_domain)
    HV = spzeros(total_nodes_domain, ndim * total_nodes_domain)
    GG = spzeros(ndim * total_nodes_domain, total_nodes_domain)
    G  = spzeros(ndim * total_nodes_domain, total_nodes_domain)
    # (MCA and the rest of the 11 matrices are allocated the same way; omitted here.)
    # Scatter the entries of one element matrix k into the COO buffers (rows, cols, values).
    function global_stiffness_matrices(k, K_size, rows, cols, values, DOFs, NEN)
        # Determine ranges for rows and columns based on K_size: dimensions of size
        # ndim*total_nodes_domain contribute ndim*NEN element entries, the others only NEN.
        row_range = K_size[1] == ndim * total_nodes_domain ? ndim * NEN : NEN
        col_range = K_size[2] == ndim * total_nodes_domain ? ndim * NEN : NEN
        for i in 1:row_range
            row_index = DOFs[i]
            for j in 1:col_range
                col_index = DOFs[j]
                push!(rows, row_index)
                push!(cols, col_index)
                push!(values, k[i, j])
            end
        end
        return rows, cols, values
    end
    # One set of COO buffers per thread, so the threaded loop below can push! without locking.
    rows_m  = [Vector{Int64}() for _ in 1:Threads.nthreads()]
    rows_hv = [Vector{Int64}() for _ in 1:Threads.nthreads()]
    cols_m  = [Vector{Int64}() for _ in 1:Threads.nthreads()]
    cols_hv = [Vector{Int64}() for _ in 1:Threads.nthreads()]
    values_m  = [Vector{Float64}() for _ in 1:Threads.nthreads()]
    values_hv = [Vector{Float64}() for _ in 1:Threads.nthreads()]
    # The same per-thread vectors exist for mca, k1, k2, gg and g (not included here).
    Threads.@threads for element in element_data_array
        tid = Threads.threadid()
        NEN = element.NEN
        # This includes the effect of periodic nodes.
        DOFs = element.DOFs_assembly
        m  = element.m
        hv = element.hv
        k1 = element.k1
        k2 = element.k2
        gg = element.gg
        g  = element.g
        rows_m[tid], cols_m[tid], values_m[tid]    = global_stiffness_matrices(m,  (size(M, 1), size(M, 2)),   rows_m[tid],  cols_m[tid],  values_m[tid],  DOFs, NEN)
        rows_hv[tid], cols_hv[tid], values_hv[tid] = global_stiffness_matrices(hv, (size(HV, 1), size(HV, 2)), rows_hv[tid], cols_hv[tid], values_hv[tid], DOFs, NEN)
        rows_k1[tid], cols_k1[tid], values_k1[tid] = global_stiffness_matrices(k1, (size(K1, 1), size(K1, 2)), rows_k1[tid], cols_k1[tid], values_k1[tid], DOFs, NEN)
        rows_k2[tid], cols_k2[tid], values_k2[tid] = global_stiffness_matrices(k2, (size(K2, 1), size(K2, 2)), rows_k2[tid], cols_k2[tid], values_k2[tid], DOFs, NEN)
        rows_gg[tid], cols_gg[tid], values_gg[tid] = global_stiffness_matrices(gg, (size(GG, 1), size(GG, 2)), rows_gg[tid], cols_gg[tid], values_gg[tid], DOFs, NEN)
        rows_g[tid], cols_g[tid], values_g[tid]    = global_stiffness_matrices(g,  (size(G, 1), size(G, 2)),   rows_g[tid],  cols_g[tid],  values_g[tid],  DOFs, NEN)
    end
    array_5 = [rows_m, rows_mca, rows_hv, rows_k1, rows_k2, rows_gg, rows_g]
    array_6 = [cols_m, cols_mca, cols_hv, cols_k1, cols_k2, cols_gg, cols_g]
    array_7 = [values_m, values_mca, values_hv, values_k1, values_k2, values_gg, values_g]

    # Merge the per-thread buffers into one rows/cols/values triple per matrix.
    function concatenate_vectors(array_5, array_6, array_7)
        results_row = Vector{Vector{Int64}}(undef, length(array_5))
        Threads.@threads for index in 1:length(array_5)
            results_row[index] = vcat(array_5[index]...)
        end
        results_col = Vector{Vector{Int64}}(undef, length(array_6))
        Threads.@threads for index in 1:length(array_6)
            results_col[index] = vcat(array_6[index]...)
        end
        results_val = Vector{Vector{Float64}}(undef, length(array_7))
        Threads.@threads for index in 1:length(array_7)
            results_val[index] = vcat(array_7[index]...)
        end
        return results_row, results_col, results_val
    end
    # Concatenate the individual rows, cols and values arrays.
    rows_mat, cols_mat, values_mat = concatenate_vectors(array_5, array_6, array_7)

    # Now construct the sparse CSC matrices; sparse() sums duplicate (row, col) entries.
    main_array = [M, MCA, HV, K1, K2, GG, G]
    Threads.@threads for index in 1:length(main_array)
        row = rows_mat[index]
        col = cols_mat[index]
        val = values_mat[index]
        size_1 = size(main_array[index], 1)
        size_2 = size(main_array[index], 2)
        main_array[index] = sparse(row, col, val, size_1, size_2)
    end
    M, MCA, HV, K1, K2, GG, G = main_array
    return M, MCA, HV, K1, K2, GG, G
end # end of function
There are actually 11 matrices like this; I have only included a few of them here. The matrices do not all have the same size (for example, HV has only total_nodes_domain rows while M has ndim*total_nodes_domain), which is why I need the row_range and col_range variables.
element_data_array is an array of structs; each struct contains all the information for one element.
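Roughly, each struct looks like the sketch below (the struct name and exact field types are just illustrative; the real struct also carries the element matrices for the other global matrices I omitted above):

# Illustrative sketch of the per-element struct (name and field types assumed).
struct ElementData
    NEN::Int                      # number of element nodes
    DOFs_assembly::Vector{Int}    # global DOF indices, periodic nodes already mapped
    m::Matrix{Float64}            # element matrices; m is (ndim*NEN) x (ndim*NEN)
    k1::Matrix{Float64}
    k2::Matrix{Float64}
    hv::Matrix{Float64}           # rectangular block, NEN x (ndim*NEN)
    gg::Matrix{Float64}
    g::Matrix{Float64}
end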
Right now, for total_nodes_domain = 23,000, it takes about 1.9 seconds to assemble all the matrices, which is still a lot for me, since I aim to solve problems with at least 200,000 nodes.
In addition, the assembly allocates a lot of memory (5.950 GiB); I would like to reduce that as well, since my memory limit is 188 GB.
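(For reference, these numbers come from a benchmark call roughly like the one below; the exact invocation is not important, and @time reports similar figures.)

using BenchmarkTools
# Rough sketch of how the timing/allocation figures above were obtained.
@btime assembling_matrices($ndim, $total_nodes_domain, $element_data_array,
                           $fixed_u_nodes, $fixed_v_nodes, $fixed_w_nodes, $fixed_p_nodes);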
I have tried my best to optimize this further but have not managed to. It would be a great help if you could assist me in making this code faster and less allocation-heavy.
Thank you for your suggestions. I apologize if this information is not enough; if any extra information is needed, please let me know.