Multiple-threading does not increase speed in my 16-threads workstation, I am wondering is there something wrong in my code?

Hi, I am recently trying to leverage multiple threading to speed my code. However, even though I likely realized multiple threading, my code run slower. Could somebody proficient in this have a skim on this? My code is posted below. I am attempting to separately compute the four portions of the Jacobian matrix of my model system. The results from my code is right since the combined output is a right Jacobian. But the performance is not improved. The results from @btime shows the maximum time consumes by one of the four branches is 396 ms, while the total time for parallelization is 876 ms. This is beyond my expectation if they are really working simultaneously.

function constitutive_elast1(nodalforce1, noddis)
    for jii = 1:length1_1
        nodalforce1[jii] = 0.0
    end
    for kjh in element_num_inter1

        epsiNk = BNjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BNik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        epsiMk = BMjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BMik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        epsiLk = BLjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BLik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        #epsiTk = sqrt(epsiMk^2 + epsiLk^2)
        #zeroFloa = 0.0
        sigmNk = epsiNk * EN
        sigmMk = epsiMk * EN * alpha
        sigmLk = epsiLk * EN * alpha
        #println([sigmNk,sigmMk,sigmLk])
        if elements41[kjh][1] >= 1 && elements41[kjh][1] <= portion_1
        nodalforce1[(elements41[kjh][1]*6-5):(elements41[kjh][1]*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] >= 1 && elements41[kjh][2] <= portion_1
        nodalforce1[(elements41[kjh][2]*6-5):(elements41[kjh][2]*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    for kjh in element_num_bound1

        epsiNk = BNjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BNik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        epsiMk = BMjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BMik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        epsiLk = BLjk2[kjh] * noddis[(6*elements41[kjh][2]-5):(6*elements41[kjh][2])] - BLik2[kjh] * noddis[(6*elements41[kjh][1]-5):(6*elements41[kjh][1])]
        sigmNk = epsiNk * EN * 50
        sigmMk = epsiMk * EN * alpha * 50
        sigmLk = epsiLk * EN * alpha * 50
        if elements41[kjh][1] >= 1 && elements41[kjh][1] <= portion_1
        nodalforce1[(elements41[kjh][1]*6-5):(elements41[kjh][1]*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] >= 1 && elements41[kjh][2] <= portion_1
        nodalforce1[(elements41[kjh][2]*6-5):(elements41[kjh][2]*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    nothing
end


function constitutive_elast2(nodalforce1, noddis)
    for jii = 1:length1_2
        nodalforce1[jii] = 0.0
    end
    for kjh in element_num_inter2

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        #epsiTk = sqrt(epsiMk^2 + epsiLk^2)
        #zeroFloa = 0.0
        sigmNk = epsiNk * EN
        sigmMk = epsiMk * EN * alpha
        sigmLk = epsiLk * EN * alpha
        #println([sigmNk,sigmMk,sigmLk])
        if elements41[kjh][1] > portion_1 && elements41[kjh][1] <= portion_2
        nodalforce1[((elements41[kjh][1]-portion_1)*6-5):((elements41[kjh][1]-portion_1)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_1 && elements41[kjh][2] <= portion_2
        nodalforce1[((elements41[kjh][2]-portion_1)*6-5):((elements41[kjh][2]-portion_1)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    for kjh in element_num_bound2

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion2[1]+1)-5):(6*(elements41[kjh][2]-portion2[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion2[1]+1)-5):(6*(elements41[kjh][1]-portion2[1]+1))]
        sigmNk = epsiNk * EN * 50
        sigmMk = epsiMk * EN * alpha * 50
        sigmLk = epsiLk * EN * alpha * 50
        if elements41[kjh][1] > portion_1 && elements41[kjh][1] <= portion_2
        nodalforce1[((elements41[kjh][1]-portion_1)*6-5):((elements41[kjh][1]-portion_1)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_1 && elements41[kjh][2] <= portion_2
        nodalforce1[((elements41[kjh][2]-portion_1)*6-5):((elements41[kjh][2]-portion_1)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    nothing
end


function constitutive_elast3(nodalforce1, noddis)
    for jii = 1:length1_3
        nodalforce1[jii] = 0.0
    end
    for kjh in element_num_inter3

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        #epsiTk = sqrt(epsiMk^2 + epsiLk^2)
        #zeroFloa = 0.0
        sigmNk = epsiNk * EN
        sigmMk = epsiMk * EN * alpha
        sigmLk = epsiLk * EN * alpha
        #println([sigmNk,sigmMk,sigmLk])
        if elements41[kjh][1] > portion_2 && elements41[kjh][1] <= portion_3
        nodalforce1[((elements41[kjh][1]-portion_2)*6-5):((elements41[kjh][1]-portion_2)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_2 && elements41[kjh][2] <= portion_3
        nodalforce1[((elements41[kjh][2]-portion_2)*6-5):((elements41[kjh][2]-portion_2)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    for kjh in element_num_bound3

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion3[1]+1)-5):(6*(elements41[kjh][2]-portion3[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion3[1]+1)-5):(6*(elements41[kjh][1]-portion3[1]+1))]
        sigmNk = epsiNk * EN * 50
        sigmMk = epsiMk * EN * alpha * 50
        sigmLk = epsiLk * EN * alpha * 50
        if elements41[kjh][1] > portion_2 && elements41[kjh][1] <= portion_3
        nodalforce1[((elements41[kjh][1]-portion_2)*6-5):((elements41[kjh][1]-portion_2)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_2 && elements41[kjh][2] <= portion_3
        nodalforce1[((elements41[kjh][2]-portion_2)*6-5):((elements41[kjh][2]-portion_2)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    nothing
end


function constitutive_elast4(nodalforce1, noddis)
    for jii = 1:length1_4
        nodalforce1[jii] = 0.0
    end
    for kjh in element_num_inter4

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        #epsiTk = sqrt(epsiMk^2 + epsiLk^2)
        #zeroFloa = 0.0
        sigmNk = epsiNk * EN
        sigmMk = epsiMk * EN * alpha
        sigmLk = epsiLk * EN * alpha
        #println([sigmNk,sigmMk,sigmLk])
        if elements41[kjh][1] > portion_3 && elements41[kjh][1] <= portion_4
        nodalforce1[((elements41[kjh][1]-portion_3)*6-5):((elements41[kjh][1]-portion_3)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_3 && elements41[kjh][2] <= portion_4
        nodalforce1[((elements41[kjh][2]-portion_3)*6-5):((elements41[kjh][2]-portion_3)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    for kjh in element_num_bound4

        epsiNk = BNjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BNik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        epsiMk = BMjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BMik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        epsiLk = BLjk2[kjh] * noddis[(6*(elements41[kjh][2]-portion4[1]+1)-5):(6*(elements41[kjh][2]-portion4[1]+1))] - BLik2[kjh] * noddis[(6*(elements41[kjh][1]-portion4[1]+1)-5):(6*(elements41[kjh][1]-portion4[1]+1))]
        sigmNk = epsiNk * EN * 50
        sigmMk = epsiMk * EN * alpha * 50
        sigmLk = epsiLk * EN * alpha * 50
        if elements41[kjh][1] > portion_3 && elements41[kjh][1] <= portion_4
        nodalforce1[((elements41[kjh][1]-portion_3)*6-5):((elements41[kjh][1]-portion_3)*6)] += (-le2[kjh] * Ak2[kjh] * (sigmNk * BNik2[kjh] + sigmMk * BMik2[kjh] + sigmLk * BLik2[kjh]))'
        end
        if elements41[kjh][2] > portion_3 && elements41[kjh][2] <= portion_4
        nodalforce1[((elements41[kjh][2]-portion_3)*6-5):((elements41[kjh][2]-portion_3)*6)] += (le2[kjh] * Ak2[kjh] * (sigmNk * BNjk2[kjh] + sigmMk * BMjk2[kjh] + sigmLk * BLjk2[kjh]))'
        end
    end
    nothing
end

@btime begin 
    task15 = Threads.@spawn forwarddiff_color_jacobian!(jac1, constitutive_elast1, input[portion1[1]*6-5:portion1[end]*6], colorvec=colors1_1)
    task16 = Threads.@spawn forwarddiff_color_jacobian!(jac2, constitutive_elast2, input[portion2[1]*6-5:portion2[end]*6], colorvec=colors1_2)
    task17 = Threads.@spawn forwarddiff_color_jacobian!(jac3, constitutive_elast3, input[portion3[1]*6-5:portion3[end]*6], colorvec=colors1_3)
    task18 = Threads.@spawn forwarddiff_color_jacobian!(jac4, constitutive_elast4, input[portion4[1]*6-5:portion4[end]*6], colorvec=colors1_4)
    
    tangent_matrix1[1:(portion_1*6), portion1[1]*6-5:portion1[end]*6] = fetch(task15)
    tangent_matrix1[(portion_1*6+1):(portion_2*6), portion2[1]*6-5:portion2[end]*6] = fetch(task16)
    tangent_matrix1[(portion_2*6+1):(portion_3*6), portion3[1]*6-5:portion3[end]*6] = fetch(task17)
    tangent_matrix1[(portion_3*6+1):(portion_4*6), portion4[1]*6-5:portion4[end]*6] = fetch(task18)
end

For one thing, you are allocating lots of little arrays in your inner loops, that won’t help performance or parallelization. Not to mention using lots of global variables. I would fix the serial performance first, before worrying about multi-threading.

(But it’s hard to say anything for certain since your posted code is not runnable; it’s not even clear how big the computation that you’re trying to parallelize is.)

1 Like

All the global variables in my function are defined as constants. I did not post all codes here since there are too much, so I only post some relevant to multithreads.

Thanks for your notice, I am happy to know there are still some spaces to improve my code.