Is the linear system solver \ also multi threaded in Julia as in Matlab? And how to “multithread” it in Julia?

For the record, I have experimented with it, also inspired by a talk I have watched (JuliaCon 2016 | Iterative Methods for Sparse Linear Systems in Julia | Lars Ruthotto - YouTube).
Honestly, in my experience it performed more poorly than just A\b, with A sparse with dimensions (60802, 60802) and the normal LinearAlgebra package (julia> BLAS.vendor() :openblas64).

@time K = sparse(iK,jK,sK)
@time Ksolve = Symmetric(K[freedofs,freedofs])
@time Usolve = Ksolve\F
@time U[freedofs] = Usolve
@time U[freedofs] = Symmetric(K[freedofs,freedofs])\F

gives:

0.024518 seconds (17 allocations: 47.242 MiB)
0.008867 seconds (8 allocations: 17.448 MiB)
0.141280 seconds (50 allocations: 128.413 MiB)
0.000167 seconds
0.149922 seconds (58 allocations: 145.861 MiB)

We can thus say that:

K = sparse(iK,jK,sK)
Ksolve = Symmetric(K[freedofs,freedofs])
Usolve = Ksolve\F
U[freedofs] = Usolve

takes 0.174832 s, and:

      U[freedofs] = Symmetric(K[freedofs,freedofs])\F

takes 0.149922 s.

Note: the size of K is (60802, 60802).

Perhaps a minor thing, but don’t do this. It creates an unnecessary copy. If you need to shape it into an 1D vector write

vec(KE) 

instead.

Are you doing the benchmarking in global scope (you shouldn’t)? And are you making sure not to include compilation time?

Also, you cannot add up runtimes like this, in fact, you should be aware that the @time macro is not well suited to doing microbenchmarks. Instead wrap your code in a function before benchmarking, and use BenchmarkTools.jl.

I am confused: didn’t you previously provide timing of 18 seconds?
0.15 seconds is nothing to complain about for a matrix this size…

18 sec for the full optimization analysis, with about 61 iterations

I see, I didn’t get that. I thought it was just the solve. Ignore my advice.

I think 0.15 sec for a 60k x 60k matrix is reasonable. What did you expect?

It all started from the fact that I am getting the same computational time for a “mirrored” (conceptually) code in Matlab. I was trying to prove to my self that Julia is faster. Instead I managed with some fine tuning to get to Matlab performance, and not better…

Let’s make it faster then. Post your code … :slight_smile:

But did you wrap the code in a function, and do the benchmarking properly?

It would be nice to see both the code and the benchmarking code.

Here we go: https://github.com/pollinico/topopt_jl

@PetrKryslUCSD did you have a chance to look at the code? Any suggestions for speedup? Thanks!

Wow, that was four years ago. I’m afraid I will have to look at it again to refresh my memory.

I was just curious to know if there were any ideas. Nothing urgent of course, just for fun/curiosity

With LLMs, the biggest change to do is :

for the optimize function :

@inline function _find_csc_position(A::SparseMatrixCSC{Float64,Int}, r::Int, c::Int)::Int
    @inbounds for p in A.colptr[c]:(A.colptr[c + 1] - 1)
        if A.rowval[p] == r
            return p
        end
    end
    error("CSC position not found for ($r, $c)")
end

function build_free_stiffness_cache(
    freedofs::AbstractVector{Int},
    iK::AbstractVector{Int},
    jK::AbstractVector{Int},
    KE::AbstractMatrix{Float64},
    nele::Int,
    ndofs::Int,
)
    nfree = length(freedofs)

    freeid = zeros(Int, ndofs)
    @inbounds for k in 1:nfree
        freeid[freedofs[k]] = k
    end

    # Exact equivalent of:
    #   sK = reshape(vec(KE) * Emod', 64*nele)
    #   K  = sparse(iK, jK, sK)
    #   Kff = K[freedofs, freedofs]
    #   Symmetric(Kff)              # Julia default: upper triangle
    #
    # Therefore we keep only entries that land in the upper triangle of the
    # free-dof matrix, using the original iK/jK ordering and vec(KE) ordering.
    # This is stricter than rebuilding pairs from edofMat, and remains correct
    # even if KE is not exactly symmetric or if iK/jK are customized.
    KEv = vec(KE)
    rows = Int[]
    cols = Int[]
    asmelem = Int[]
    asmval = Float64[]
    sizehint!(rows, 36 * nele)
    sizehint!(cols, 36 * nele)
    sizehint!(asmelem, 36 * nele)
    sizehint!(asmval, 36 * nele)

    @inbounds for q in eachindex(iK)
        row_full = iK[q]
        col_full = jK[q]
        row_free = freeid[row_full]
        col_free = freeid[col_full]
        (row_free == 0 || col_free == 0) && continue

        # Same triangle used by Symmetric(K[freedofs,freedofs]) with default :U.
        row_free <= col_free || continue

        local_id = (q - 1) % 64 + 1
        elem_id = (q - 1) ÷ 64 + 1

        push!(rows, row_free)
        push!(cols, col_free)
        push!(asmelem, elem_id)
        push!(asmval, KEv[local_id])
    end

    Kff = sparse(rows, cols, ones(Float64, length(rows)), nfree, nfree)
    asmpos = Vector{Int}(undef, length(rows))
    @inbounds for q in eachindex(rows)
        asmpos[q] = _find_csc_position(Kff, rows[q], cols[q])
    end
    fill!(Kff.nzval, 0.0)

    return Kff, asmpos, asmelem, asmval
end

@inline function assemble_free_stiffness!(
    Kff::SparseMatrixCSC{Float64,Int},
    asmpos::Vector{Int},
    asmelem::Vector{Int},
    asmval::Vector{Float64},
    Emod::Vector{Float64},
)
    fill!(Kff.nzval, 0.0)
    @inbounds for q in eachindex(asmpos)
        Kff.nzval[asmpos[q]] += asmval[q] * Emod[asmelem[q]]
    end
    return Kff
end

function check_free_stiffness_cache(
    freedofs::AbstractVector{Int},
    iK::AbstractVector{Int},
    jK::AbstractVector{Int},
    KE::AbstractMatrix{Float64},
    Emod::AbstractVector{Float64},
    Kff::SparseMatrixCSC{Float64,Int},
    asmpos::Vector{Int},
    asmelem::Vector{Int},
    asmval::Vector{Float64},
    ndofs::Int,
)
    # Debug helper, not used in the optimized loop.
    # Returns 0.0 up to floating-point roundoff if the optimized assembly matches
    # the old sparse(iK,jK,sK) + extraction path.
    sK_ref = reshape(vec(KE) * reshape(collect(Emod), 1, :), length(iK))
    K_ref = sparse(iK, jK, sK_ref, ndofs, ndofs)
    assemble_free_stiffness!(Kff, asmpos, asmelem, asmval, collect(Emod))
    return norm(Matrix(Symmetric(K_ref[freedofs, freedofs])) - Matrix(Symmetric(Kff, :U)), Inf)
end

@inline function update_material!(
    Emod::Vector{Float64},
    xPhysv::AbstractVector{Float64},
    penal::Float64,
    E0::Float64,
    Emin::Float64,
)
    dE = E0 - Emin
    @inbounds @simd for e in eachindex(Emod)
        xp = xPhysv[e]
        Emod[e] = Emin + xp^penal * dE
    end
    return nothing
end

function refactorize_cholesky!(old_factor, Kff::SparseMatrixCSC{Float64,Int})
    A = Symmetric(Kff, :U)
    if old_factor === nothing
        return cholesky(A; check=false)
    end

    # Julia's CHOLMOD interface supports reusing a symbolic factorization on many versions.
    # If not available locally, this falls back to a fresh factorization with the same matrix.
    try
        return cholesky!(old_factor, A; check=false)
    catch
        return cholesky(A; check=false)
    end
end

function solve_free_displacements!(
    U::Vector{Float64},
    Ufree::Vector{Float64},
    Ffree::AbstractVector{Float64},
    freedofs::AbstractVector{Int},
    Kfact,
)
    ldiv!(Ufree, Kfact, Ffree)
    @inbounds @simd for k in eachindex(freedofs)
        U[freedofs[k]] = Ufree[k]
    end
    return nothing
end

# -----------------------------------------------------------------------------
# Element objective/sensitivity without U[edofMat] allocation
# -----------------------------------------------------------------------------

function element_objective_sensitivities!(
    ce::AbstractVector{Float64},
    dc::AbstractVector{Float64},
    dv::AbstractVector{Float64},
    U::AbstractVector{Float64},
    edofMat::AbstractMatrix{Int},
    KE::AbstractMatrix{Float64},
    xPhysv::AbstractVector{Float64},
    Emod::AbstractVector{Float64},
    penal::Float64,
    E0::Float64,
    Emin::Float64,
)
    dE = E0 - Emin
    c = 0.0
    nele = length(ce)

    @inbounds for e in 1:nele
        energy = 0.0
        for i in 1:8
            ui = U[edofMat[e, i]]
            kui = 0.0
            for j in 1:8
                kui += KE[i, j] * U[edofMat[e, j]]
            end
            energy += ui * kui
        end

        xp = xPhysv[e]
        ce[e] = energy
        c += Emod[e] * energy
        dc[e] = -penal * dE * xp^(penal - 1.0) * energy
        dv[e] = 1.0
    end

    return c
end

# -----------------------------------------------------------------------------
# Filters without temporary vectors from broadcasted H * ... expressions
# -----------------------------------------------------------------------------

function apply_filter!(
    dc::AbstractVector{Float64},
    dv::AbstractVector{Float64},
    x::AbstractVector{Float64},
    H::SparseMatrixCSC{Float64,Int},
    Hs::AbstractVector{Float64},
    work1::Vector{Float64},
    work2::Vector{Float64},
    ft::Int,
)
    n = length(dc)

    if ft == 1
        @inbounds @simd for e in 1:n
            work1[e] = x[e] * dc[e]
        end
        mul!(work2, H, work1)
        @inbounds @simd for e in 1:n
            dc[e] = work2[e] / Hs[e] / max(1.0e-3, x[e])
        end
    elseif ft == 2
        @inbounds @simd for e in 1:n
            work1[e] = dc[e] / Hs[e]
        end
        mul!(work2, H, work1)
        copyto!(dc, work2)

        @inbounds @simd for e in 1:n
            work1[e] = dv[e] / Hs[e]
        end
        mul!(work2, H, work1)
        copyto!(dv, work2)
    end

    return nothing
end

# -----------------------------------------------------------------------------
# OC update without broadcast allocations
# -----------------------------------------------------------------------------

function optimalityCriteria_fast!(
    x::AbstractVector{Float64},
    xPhys::AbstractVector{Float64},
    xnew::AbstractVector{Float64},
    dc::AbstractVector{Float64},
    dv::AbstractVector{Float64},
    H::SparseMatrixCSC{Float64,Int},
    Hs::AbstractVector{Float64},
    work::Vector{Float64},
    ft::Int,
    volfrac::Float64,
)
    l1 = 0.0
    l2 = 1.0e9
    move = 0.2
    n = length(x)

    while (l2 - l1) / (l1 + l2) > 1.0e-3
        lmid = 0.5 * (l2 + l1)

        @inbounds for e in 1:n
            xe = x[e]
            cand = xe * sqrt(-dc[e] / dv[e] / lmid)
            xnew[e] = max(0.0, max(xe - move, min(1.0, min(xe + move, cand))))
        end

        if ft == 1
            copyto!(xPhys, xnew)
        elseif ft == 2
            mul!(work, H, xnew)
            @inbounds @simd for e in 1:n
                xPhys[e] = work[e] / Hs[e]
            end
        end

        sx = 0.0
        @inbounds @simd for e in 1:n
            sx += xPhys[e]
        end

        if sx > volfrac * n
            l1 = lmid
        else
            l2 = lmid
        end
    end

    return nothing
end

# -----------------------------------------------------------------------------
# Optimized optimize!
# -----------------------------------------------------------------------------

function optimize!(
    F::AbstractVector{Float64},
    U::Vector{Float64},
    freedofs::AbstractVector{Int},
    penal::Float64,
    E0::Float64,
    Emin::Float64,
    ft::Int,
    nelx::Int,
    nely::Int,
    iK::AbstractVector{Int},      # kept for API compatibility; no longer used
    jK::AbstractVector{Int},      # kept for API compatibility; no longer used
    edofMat::AbstractMatrix{Int},
    KE::AbstractMatrix{Float64},
    H::SparseMatrixCSC{Float64,Int},
    Hs::AbstractArray{Float64},
    volfrac::Float64,
    alg::String,
)::Tuple{Matrix{Float64},Matrix{Float64}}

    n = nelx * nely
    ndofs = length(U)
    nfree = length(freedofs)
    @assert length(F) == nfree

    x = fill(volfrac, nely, nelx)
    xPhys = copy(x)
    xnew = similar(x)

    xv = vec(x)
    xPhysv = vec(xPhys)
    xnewv = vec(xnew)
    Hsv = vec(Hs)

    ce = Vector{Float64}(undef, n)
    dc = Vector{Float64}(undef, n)
    dv = Vector{Float64}(undef, n)
    Emod = Vector{Float64}(undef, n)
    work1 = Vector{Float64}(undef, n)
    work2 = Vector{Float64}(undef, n)
    Ufree = Vector{Float64}(undef, nfree)

    Kff, asmpos, asmelem, asmval = build_free_stiffness_cache(freedofs, iK, jK, KE, n, ndofs)
    Kfact = nothing

    if alg == "MMA"
        m = 1
        xold1 = copy(xv)
        xold2 = copy(xv)
        xmin = zeros(Float64, n)
        xmax = ones(Float64, n)
        low = zeros(Float64, n)
        upp = ones(Float64, n)
        cc = fill(1000.0, m)
        dd = ones(Float64, m)
        aa0 = 1.0
        aa = zeros(Float64, m)
        move = 0.2
        f0 = 1.0

        df0dx = Vector{Float64}(undef, n)
        fval = Vector{Float64}(undef, m)
        dfdx = Matrix{Float64}(undef, m, n)
    end

    maxoutit = 1000
    kkttol = 5.0e-5
    kktnorm = kkttol + 10.0
    loop = 0
    change = 1.0

    fill!(U, 0.0)

    while kktnorm > kkttol && change > 1.0e-2 && loop < maxoutit
        loop += 1

        # FE analysis: update values in the existing sparse pattern, refactorize, solve.
        update_material!(Emod, xPhysv, penal, E0, Emin)
        assemble_free_stiffness!(Kff, asmpos, asmelem, asmval, Emod)
        Kfact = refactorize_cholesky!(Kfact, Kff)
        solve_free_displacements!(U, Ufree, F, freedofs, Kfact)

        # Objective and sensitivities.
        c = element_objective_sensitivities!(ce, dc, dv, U, edofMat, KE, xPhysv, Emod, penal, E0, Emin)

        # Filtering.
        apply_filter!(dc, dv, xv, H, Hsv, work1, work2, ft)

        if alg == "OC"
            optimalityCriteria_fast!(xv, xPhysv, xnewv, dc, dv, H, Hsv, work1, ft, volfrac)

            change = 0.0
            @inbounds @simd for e in 1:n
                change = max(change, abs(xnewv[e] - xv[e]))
            end
            copyto!(xv, xnewv)

        elseif alg == "MMA"
            @inbounds @simd for e in 1:n
                xe = xv[e]
                xmin[e] = max(xe - move, 0.0)
                xmax[e] = min(xe + move, 1.0)
            end

            if loop == 1
                f0 = c
            end

            f0val = c / f0
            @inbounds @simd for e in 1:n
                df0dx[e] = dc[e] / f0
                dfdx[1, e] = dv[e] / (n * volfrac)
            end

            sxphys = 0.0
            @inbounds @simd for e in 1:n
                sxphys += xPhysv[e]
            end
            fval[1] = sxphys / (n * volfrac) - 1.0

            xmma, ymma, zmma, lam, xsi, eta, mu, zet, s, low, upp =
                mmasub(m, n, loop, xv, xmin, xmax, xold1, xold2,
                       f0val, df0dx, fval, dfdx, low, upp, aa0, aa, cc, dd)

            copyto!(xold2, xold1)
            copyto!(xold1, xv)

            if ft == 1
                copyto!(xPhysv, xmma)
            elseif ft == 2
                mul!(work1, H, xmma)
                @inbounds @simd for e in 1:n
                    xPhysv[e] = work1[e] / Hsv[e]
                end
            end

            change = 0.0
            @inbounds @simd for e in 1:n
                change = max(change, abs(xmma[e] - xv[e]))
            end
            copyto!(xv, xmma)

            _, kktnorm, _ = kktcheck(
                m, n, xmma, ymma, zmma, lam, xsi, eta, mu, zet, s,
                xmin, xmax, df0dx, fval, dfdx, aa0, aa, cc, dd,
            )
        else
            error("Unknown algorithm: $alg. Expected \"OC\" or \"MMA\".")
        end

        @printf("It: %i  Obj.: %.4f Vol: %.3f  Ch: %.3f |KKT|: %.4f\n",
                loop, c, mean(xPhysv), change, kktnorm)
    end

    return x, xPhys
end

and for the subsolv :


@inline function _subsolv_update_x_work!(
    ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
    x, low, upp,
)
    @inbounds @simd for j in eachindex(x)
        ux = Float64(upp[j]) - x[j]
        xl = x[j] - Float64(low[j])
        ux_2 = ux * ux
        xl_2 = xl * xl
        ux1[j] = ux
        xl1[j] = xl
        ux2[j] = ux_2
        xl2[j] = xl_2
        ux3[j] = ux_2 * ux
        xl3[j] = xl_2 * xl
        uxinv1[j] = 1.0 / ux
        xlinv1[j] = 1.0 / xl
        uxinv2[j] = 1.0 / ux_2
        xlinv2[j] = 1.0 / xl_2
    end
    return nothing
end

@inline function _subsolv_add_sparse_scaled_cols!(GG, A::SparseMatrixCSC, scale, sgn::Float64)
    @inbounds for j in 1:size(A, 2)
        sc = sgn * scale[j]
        for p in A.colptr[j]:(A.colptr[j + 1] - 1)
            GG[A.rowval[p], j] += Float64(A.nzval[p]) * sc
        end
    end
    return nothing
end

function _subsolv_fill_GG!(GG, P::SparseMatrixCSC, Q::SparseMatrixCSC, uxinv2, xlinv2)
    fill!(GG, 0.0)
    _subsolv_add_sparse_scaled_cols!(GG, P, uxinv2, 1.0)
    _subsolv_add_sparse_scaled_cols!(GG, Q, xlinv2, -1.0)
    return nothing
end

function _subsolv_fill_GG!(GG, P::SparseMatrixCSC, Q::AbstractMatrix, uxinv2, xlinv2)
    m, n = size(GG)
    @inbounds for j in 1:n
        scq = xlinv2[j]
        for i in 1:m
            GG[i, j] = -Float64(Q[i, j]) * scq
        end
    end
    _subsolv_add_sparse_scaled_cols!(GG, P, uxinv2, 1.0)
    return nothing
end

function _subsolv_fill_GG!(GG, P::AbstractMatrix, Q::SparseMatrixCSC, uxinv2, xlinv2)
    m, n = size(GG)
    @inbounds for j in 1:n
        scp = uxinv2[j]
        for i in 1:m
            GG[i, j] = Float64(P[i, j]) * scp
        end
    end
    _subsolv_add_sparse_scaled_cols!(GG, Q, xlinv2, -1.0)
    return nothing
end

function _subsolv_fill_GG!(GG, P::AbstractMatrix, Q::AbstractMatrix, uxinv2, xlinv2)
    m, n = size(GG)
    @inbounds for j in 1:n
        scp = uxinv2[j]
        scq = xlinv2[j]
        for i in 1:m
            GG[i, j] = Float64(P[i, j]) * scp - Float64(Q[i, j]) * scq
        end
    end
    return nothing
end

function _subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)
    mul!(plam, transpose(P), lam)
    mul!(qlam, transpose(Q), lam)
    @inbounds @simd for j in eachindex(plam)
        plam[j] += Float64(p0[j])
        qlam[j] += Float64(q0[j])
    end

    mul!(gvec, P, uxinv1)
    mul!(tmpm, Q, xlinv1)
    @inbounds @simd for i in eachindex(gvec)
        gvec[i] += tmpm[i]
    end
    return nothing
end

function _subsolv_residual!(
    ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
    plam, qlam, gvec, tmpm,
    x, y, z::Float64, lam, xsi, eta, mu, zet::Float64, s,
    epsi::Float64,
    low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
)
    _subsolv_update_x_work!(ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2, x, low, upp)
    _subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)

    norm2 = 0.0
    maxabs = 0.0

    @inbounds @simd for j in eachindex(x)
        rex = plam[j] / ux2[j] - qlam[j] / xl2[j] - xsi[j] + eta[j]
        rexsi = xsi[j] * (x[j] - Float64(alfa[j])) - epsi
        reeta = eta[j] * (Float64(beta[j]) - x[j]) - epsi
        norm2 += rex * rex + rexsi * rexsi + reeta * reeta
        maxabs = max(maxabs, abs(rex), abs(rexsi), abs(reeta))
    end

    adotlam = 0.0
    @inbounds @simd for i in eachindex(lam)
        adotlam += Float64(a[i]) * lam[i]
    end
    rez = Float64(a0) - zet - adotlam
    rezet = zet * z - epsi
    norm2 += rez * rez + rezet * rezet
    maxabs = max(maxabs, abs(rez), abs(rezet))

    @inbounds @simd for i in eachindex(y)
        rey = Float64(c[i]) + Float64(d[i]) * y[i] - mu[i] - lam[i]
        relam = gvec[i] - Float64(a[i]) * z - y[i] + s[i] - Float64(b[i])
        remu = mu[i] * y[i] - epsi
        res = lam[i] * s[i] - epsi
        norm2 += rey * rey + relam * relam + remu * remu + res * res
        maxabs = max(maxabs, abs(rey), abs(relam), abs(remu), abs(res))
    end

    return sqrt(norm2), maxabs
end

function subsolv(m, n, epsimin, low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d)
    # Primal and dual variables.
    x = Vector{Float64}(undef, n)
    y = ones(Float64, m)
    z = 1.0
    lam = ones(Float64, m)
    xsi = Vector{Float64}(undef, n)
    eta = Vector{Float64}(undef, n)
    mu = Vector{Float64}(undef, m)
    zet = 1.0
    s = ones(Float64, m)

    @inbounds @simd for j in 1:n
        x[j] = 0.5 * (Float64(alfa[j]) + Float64(beta[j]))
        xsi[j] = max(1.0 / (x[j] - Float64(alfa[j])), 1.0)
        eta[j] = max(1.0 / (Float64(beta[j]) - x[j]), 1.0)
    end
    @inbounds @simd for i in 1:m
        mu[i] = max(1.0, 0.5 * Float64(c[i]))
    end

    # n-work arrays.
    ux1 = Vector{Float64}(undef, n)
    xl1 = Vector{Float64}(undef, n)
    ux2 = Vector{Float64}(undef, n)
    xl2 = Vector{Float64}(undef, n)
    ux3 = Vector{Float64}(undef, n)
    xl3 = Vector{Float64}(undef, n)
    uxinv1 = Vector{Float64}(undef, n)
    xlinv1 = Vector{Float64}(undef, n)
    uxinv2 = Vector{Float64}(undef, n)
    xlinv2 = Vector{Float64}(undef, n)
    plam = Vector{Float64}(undef, n)
    qlam = Vector{Float64}(undef, n)
    delx = Vector{Float64}(undef, n)
    diagx = Vector{Float64}(undef, n)
    diagxinv = Vector{Float64}(undef, n)
    dx = Vector{Float64}(undef, n)
    dxsi = Vector{Float64}(undef, n)
    deta = Vector{Float64}(undef, n)
    tmpn = Vector{Float64}(undef, n)

    # m-work arrays.
    gvec = Vector{Float64}(undef, m)
    tmpm = Vector{Float64}(undef, m)
    dely = Vector{Float64}(undef, m)
    dellam = Vector{Float64}(undef, m)
    diagy = Vector{Float64}(undef, m)
    diagyinv = Vector{Float64}(undef, m)
    diaglamyi = Vector{Float64}(undef, m)
    dlam = Vector{Float64}(undef, m)
    dy = Vector{Float64}(undef, m)
    dmu = Vector{Float64}(undef, m)
    ds = Vector{Float64}(undef, m)

    # Old values for line search.
    xold = similar(x)
    yold = similar(y)
    lamold = similar(lam)
    xsiold = similar(xsi)
    etaold = similar(eta)
    muold = similar(mu)
    sold = similar(s)

    # GG is dense because the Schur complements become dense anyway.
    GG = Matrix{Float64}(undef, m, n)

    # Linear-system workspaces. Only one branch is active for a given call.
    if m < n
        Alam = Matrix{Float64}(undef, m, m)
        AA = Matrix{Float64}(undef, m + 1, m + 1)
        bb = Vector{Float64}(undef, m + 1)
        blam = Vector{Float64}(undef, m)
    else
        Axx = Matrix{Float64}(undef, n, n)
        AA = Matrix{Float64}(undef, n + 1, n + 1)
        bb = Vector{Float64}(undef, n + 1)
        axz = Vector{Float64}(undef, n)
        bx = Vector{Float64}(undef, n)
        dellamyi = Vector{Float64}(undef, m)
        diaglamyiinv = Vector{Float64}(undef, m)
    end

    epsi = 1.0
    itera = 0

    while epsi > Float64(epsimin)
        residunorm, residumax = _subsolv_residual!(
            ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
            plam, qlam, gvec, tmpm,
            x, y, z, lam, xsi, eta, mu, zet, s,
            epsi,
            low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
        )

        ittt = 0
        while residumax > 0.9 * epsi && ittt < 200
            ittt += 1
            itera += 1

            _subsolv_update_x_work!(ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2, x, low, upp)
            _subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)
            _subsolv_fill_GG!(GG, P, Q, uxinv2, xlinv2)

            @inbounds @simd for j in 1:n
                inva = 1.0 / (x[j] - Float64(alfa[j]))
                invb = 1.0 / (Float64(beta[j]) - x[j])
                dpsidx = plam[j] / ux2[j] - qlam[j] / xl2[j]
                delx[j] = dpsidx - epsi * inva + epsi * invb
                diagx[j] = 2.0 * (plam[j] / ux3[j] + qlam[j] / xl3[j]) + xsi[j] * inva + eta[j] * invb
                diagxinv[j] = 1.0 / diagx[j]
            end

            adotlam = 0.0
            @inbounds @simd for i in 1:m
                adotlam += Float64(a[i]) * lam[i]
            end
            delz = Float64(a0) - adotlam - epsi / z

            @inbounds @simd for i in 1:m
                dely[i] = Float64(c[i]) + Float64(d[i]) * y[i] - lam[i] - epsi / y[i]
                dellam[i] = gvec[i] - Float64(a[i]) * z - y[i] - Float64(b[i]) + epsi / lam[i]
                diagy[i] = Float64(d[i]) + mu[i] / y[i]
                diagyinv[i] = 1.0 / diagy[i]
                diaglamyi[i] = s[i] / lam[i] + diagyinv[i]
            end

            dz = 0.0

            if m < n
                # blam = dellam + dely ./ diagy - GG * (delx ./ diagx)
                @inbounds @simd for j in 1:n
                    tmpn[j] = delx[j] * diagxinv[j]
                end
                mul!(blam, GG, tmpn)
                @inbounds @simd for i in 1:m
                    blam[i] = dellam[i] + dely[i] * diagyinv[i] - blam[i]
                end

                # Alam = Diagonal(diaglamyi) + GG * Diagonal(diagxinv) * GG'
                fill!(Alam, 0.0)
                @inbounds for j in 1:n
                    wj = diagxinv[j]
                    for i in 1:m
                        gij = GG[i, j]
                        gijw = gij * wj
                        for k in i:m
                            Alam[i, k] += gijw * GG[k, j]
                        end
                    end
                end
                @inbounds for i in 1:m
                    Alam[i, i] += diaglamyi[i]
                    for k in (i + 1):m
                        Alam[k, i] = Alam[i, k]
                    end
                end

                @inbounds for i in 1:m
                    for k in 1:m
                        AA[i, k] = Alam[i, k]
                    end
                    AA[i, m + 1] = Float64(a[i])
                    AA[m + 1, i] = Float64(a[i])
                    bb[i] = blam[i]
                end
                AA[m + 1, m + 1] = -zet / z
                bb[m + 1] = delz

                ldiv!(lu!(AA), bb)

                @inbounds @simd for i in 1:m
                    dlam[i] = bb[i]
                end
                dz = bb[m + 1]

                mul!(tmpn, transpose(GG), dlam)
                @inbounds @simd for j in 1:n
                    dx[j] = -(delx[j] + tmpn[j]) * diagxinv[j]
                end
            else
                # Axx = Diagonal(diagx) + GG' * Diagonal(1 ./ diaglamyi) * GG
                @inbounds @simd for i in 1:m
                    diaglamyiinv[i] = 1.0 / diaglamyi[i]
                    dellamyi[i] = dellam[i] + dely[i] * diagyinv[i]
                end

                fill!(Axx, 0.0)
                @inbounds for j in 1:n
                    Axx[j, j] = diagx[j]
                end
                @inbounds for i in 1:m
                    wi = diaglamyiinv[i]
                    for j in 1:n
                        gij = GG[i, j]
                        gijw = gij * wi
                        for k in j:n
                            Axx[j, k] += gijw * GG[i, k]
                        end
                    end
                end
                @inbounds for j in 1:n
                    for k in (j + 1):n
                        Axx[k, j] = Axx[j, k]
                    end
                end

                azz = zet / z
                bz = delz
                @inbounds @simd for i in 1:m
                    ai = Float64(a[i])
                    wi = diaglamyiinv[i]
                    azz += ai * ai * wi
                    bz -= ai * dellamyi[i] * wi
                end

                @inbounds for j in 1:n
                    sx = 0.0
                    sb = 0.0
                    for i in 1:m
                        gij = GG[i, j]
                        wi = diaglamyiinv[i]
                        sx += gij * Float64(a[i]) * wi
                        sb += gij * dellamyi[i] * wi
                    end
                    axz[j] = -sx
                    bx[j] = delx[j] + sb
                end

                @inbounds for j in 1:n
                    for k in 1:n
                        AA[j, k] = Axx[j, k]
                    end
                    AA[j, n + 1] = axz[j]
                    AA[n + 1, j] = axz[j]
                    bb[j] = -bx[j]
                end
                AA[n + 1, n + 1] = azz
                bb[n + 1] = -bz

                ldiv!(lu!(AA), bb)

                @inbounds @simd for j in 1:n
                    dx[j] = bb[j]
                end
                dz = bb[n + 1]

                mul!(tmpm, GG, dx)
                @inbounds @simd for i in 1:m
                    dlam[i] = (tmpm[i] - dz * Float64(a[i]) + dellamyi[i]) * diaglamyiinv[i]
                end
            end

            @inbounds @simd for i in 1:m
                dy[i] = (-dely[i] + dlam[i]) * diagyinv[i]
                dmu[i] = -mu[i] + epsi / y[i] - mu[i] * dy[i] / y[i]
                ds[i] = -s[i] + epsi / lam[i] - s[i] * dlam[i] / lam[i]
            end
            dzet = -zet + epsi / z - zet * dz / z

            stmxx = max(-1.01 * dz / z, -1.01 * dzet / zet)
            stmalfa = -Inf
            stmbeta = -Inf

            @inbounds @simd for i in 1:m
                stmxx = max(
                    stmxx,
                    -1.01 * dy[i] / y[i],
                    -1.01 * dlam[i] / lam[i],
                    -1.01 * dmu[i] / mu[i],
                    -1.01 * ds[i] / s[i],
                )
            end

            @inbounds @simd for j in 1:n
                inva = 1.0 / (x[j] - Float64(alfa[j]))
                invb = 1.0 / (Float64(beta[j]) - x[j])
                dxsi[j] = -xsi[j] + epsi * inva - xsi[j] * dx[j] * inva
                deta[j] = -eta[j] + epsi * invb + eta[j] * dx[j] * invb
                stmxx = max(stmxx, -1.01 * dxsi[j] / xsi[j], -1.01 * deta[j] / eta[j])
                stmalfa = max(stmalfa, -1.01 * dx[j] * inva)
                stmbeta = max(stmbeta, 1.01 * dx[j] * invb)
            end

            steg = 1.0 / max(max(max(stmalfa, stmbeta), stmxx), 1.0)

            copyto!(xold, x)
            copyto!(yold, y)
            zold = z
            copyto!(lamold, lam)
            copyto!(xsiold, xsi)
            copyto!(etaold, eta)
            copyto!(muold, mu)
            zetold = zet
            copyto!(sold, s)

            itto = 0
            resinew = 2.0 * residunorm
            resimaxnew = residumax

            while resinew > residunorm && itto < 50
                itto += 1

                @inbounds @simd for j in 1:n
                    x[j] = xold[j] + steg * dx[j]
                    xsi[j] = xsiold[j] + steg * dxsi[j]
                    eta[j] = etaold[j] + steg * deta[j]
                end
                @inbounds @simd for i in 1:m
                    y[i] = yold[i] + steg * dy[i]
                    lam[i] = lamold[i] + steg * dlam[i]
                    mu[i] = muold[i] + steg * dmu[i]
                    s[i] = sold[i] + steg * ds[i]
                end
                z = zold + steg * dz
                zet = zetold + steg * dzet

                resinew, resimaxnew = _subsolv_residual!(
                    ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
                    plam, qlam, gvec, tmpm,
                    x, y, z, lam, xsi, eta, mu, zet, s,
                    epsi,
                    low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
                )

                steg *= 0.5
            end

            residunorm = resinew
            residumax = resimaxnew
        end

        if ittt > 198
            println("epsi: ", epsi)
            println("ittt: ", ittt)
        end
        epsi *= 0.1
    end

    return x, y, z, lam, xsi, eta, mu, zet, s
end

but the code is no more matlab like but more c like