Poor performance while multithreading (Julia 1.0)

Very good! This improves readability a lot. However, you still do not understand your own algorithm, and by the remaining convolutedness of your code you deprive yourself of the opportunity to understand it. To wit: Inline everything. First your original code, with minor changes to printing and without multithreading:

       function MWE() :: Nothing

           L1 = 10000
           L2 = 10000

           Ti = 10.0
           Tf = 0.1
           b  = 0.8

           Input1 = init(L1, L2)
           Input2 = init(L1, L2)

           while Ti > Tf

               #println("Integration step @  $(Ti).")

               for Iterator in 1 : L1
                   computeKernel1!(Ti, Iterator, Input1, Input2)
               end

               for Iterator in 1 : L1
                   computeKernel2!(Ti, Iterator, Input1, Input2)
               end

               updateStep!((b - 1) * Ti, Input1, Input2)
               Ti = b * Ti
           end

           println(Input1.Kernel1[1:10])
       end
julia> @time MWE()
[2.60948e6, 5.21896e6, 7.82844e6, 1.04379e7, 1.30474e7, 1.56569e7, 1.82664e7, 2.08759e7, 2.34853e7, 2.60948e7]
 48.036136 seconds (48.50 k allocations: 17.141 GiB, 2.87% gc time)

Inline everything:

function MWE2() :: Nothing

    L1 = 10000
    L2 = 10000

    Ti = 10.0
    Tf = 0.1
    b  = 0.8

    Input1 = MyStruct(ones(Float64, L1), ones(Float64, L2, L1))
    Input2 = MyStruct(ones(Float64, L1), ones(Float64, L2, L1))

    while Ti > Tf

       # println("Integration step @  $(Ti).")

        for Iterator in 1 : L1
            let T = Ti 
                Input2.Kernel1[Iterator] = Input1.Kernel2[Iterator] / T
            end
        end

        for Iterator in 1 : L1
            let T = Ti 
            #computeKernel2!(Ti, Iterator, Input1, Input2)
                F  = T * first(Input1.Kernel1)
                for i in 1 : size(Input1.Kernel2, 1)
                    Input2.Kernel2[i, Iterator] = F * T * i
                end

                #integrate!(-T, T, Iterator, Input1, Input2)
                let LowerBound = -T, UpperBound = T
                    for i in 1 : 9
                        v = LowerBound + i * (UpperBound - LowerBound) / 10.0
                        F = (UpperBound - LowerBound) / 10.0 * ( v * first(Input2.Kernel1)) * v
                        for j in 1 : size(Input1.Kernel2, 1)
                            Input2.Kernel2[j, Iterator] += F * j
                        end
                    end
                end

            end
        end
        let dT = (b - 1) * Ti
            Input1.Kernel1 .+= dT * Input2.Kernel1
            Input1.Kernel2 .+= dT * Input2.Kernel2
        end
        Ti = b * Ti
    end

    println(Input1.Kernel1[1:10])

end
julia> @time MWE2()
[2.60948e6, 5.21896e6, 7.82844e6, 1.04379e7, 1.30474e7, 1.56569e7, 1.82664e7, 2.08759e7, 2.34853e7, 2.60948e7]
 46.487035 seconds (142.37 k allocations: 17.145 GiB, 2.16% gc time)

Now you can refactor your code: Remove mystruct, fix missing dots, possibly replace loops by one-line broadcasts, hoist the integrate loop, remove unneeded temporaries. See that Kernel2[i, j] = Kernel2[i, k] for every j, k, so you can cut by a factor of 10000. I am assuming that this is a bug in your code? Don’t think about @threads until you understand your algorithm, and you don’t understand your algorithm until you know every loop and every dependency. By inlining, loops and dependencies become obvious.