For the record, I have experimented with it, also inspired by a talk I have watched (JuliaCon 2016 | Iterative Methods for Sparse Linear Systems in Julia | Lars Ruthotto - YouTube).
Honestly, in my experience it performed more poorly than just A\b, with A sparse with dimensions (60802, 60802) and the normal LinearAlgebra package (julia> BLAS.vendor() :openblas64).
@time K = sparse(iK,jK,sK)
@time Ksolve = Symmetric(K[freedofs,freedofs])
@time Usolve = Ksolve\F
@time U[freedofs] = Usolve
@time U[freedofs] = Symmetric(K[freedofs,freedofs])\F
gives:
0.024518 seconds (17 allocations: 47.242 MiB)
0.008867 seconds (8 allocations: 17.448 MiB)
0.141280 seconds (50 allocations: 128.413 MiB)
0.000167 seconds
0.149922 seconds (58 allocations: 145.861 MiB)
We can thus say that:
K = sparse(iK,jK,sK)
Ksolve = Symmetric(K[freedofs,freedofs])
Usolve = Ksolve\F
U[freedofs] = Usolve
takes 0.174832 s, and:
U[freedofs] = Symmetric(K[freedofs,freedofs])\F
takes 0.149922 s.
Note: the size of K is (60802, 60802).
Perhaps a minor thing, but don’t do this. It creates an unnecessary copy. If you need to shape it into an 1D vector write
vec(KE)
instead.
Are you doing the benchmarking in global scope (you shouldn’t)? And are you making sure not to include compilation time?
Also, you cannot add up runtimes like this, in fact, you should be aware that the @time macro is not well suited to doing microbenchmarks. Instead wrap your code in a function before benchmarking, and use BenchmarkTools.jl.
I am confused: didn’t you previously provide timing of 18 seconds?
0.15 seconds is nothing to complain about for a matrix this size…
18 sec for the full optimization analysis, with about 61 iterations
I see, I didn’t get that. I thought it was just the solve. Ignore my advice.
I think 0.15 sec for a 60k x 60k matrix is reasonable. What did you expect?
It all started from the fact that I am getting the same computational time for a “mirrored” (conceptually) code in Matlab. I was trying to prove to my self that Julia is faster. Instead I managed with some fine tuning to get to Matlab performance, and not better…
Let’s make it faster then. Post your code … ![]()
But did you wrap the code in a function, and do the benchmarking properly?
It would be nice to see both the code and the benchmarking code.
Here we go: https://github.com/pollinico/topopt_jl
@PetrKryslUCSD did you have a chance to look at the code? Any suggestions for speedup? Thanks!
Wow, that was four years ago. I’m afraid I will have to look at it again to refresh my memory.
I was just curious to know if there were any ideas. Nothing urgent of course, just for fun/curiosity
With LLMs, the biggest change to do is :
for the optimize function :
@inline function _find_csc_position(A::SparseMatrixCSC{Float64,Int}, r::Int, c::Int)::Int
@inbounds for p in A.colptr[c]:(A.colptr[c + 1] - 1)
if A.rowval[p] == r
return p
end
end
error("CSC position not found for ($r, $c)")
end
function build_free_stiffness_cache(
freedofs::AbstractVector{Int},
iK::AbstractVector{Int},
jK::AbstractVector{Int},
KE::AbstractMatrix{Float64},
nele::Int,
ndofs::Int,
)
nfree = length(freedofs)
freeid = zeros(Int, ndofs)
@inbounds for k in 1:nfree
freeid[freedofs[k]] = k
end
# Exact equivalent of:
# sK = reshape(vec(KE) * Emod', 64*nele)
# K = sparse(iK, jK, sK)
# Kff = K[freedofs, freedofs]
# Symmetric(Kff) # Julia default: upper triangle
#
# Therefore we keep only entries that land in the upper triangle of the
# free-dof matrix, using the original iK/jK ordering and vec(KE) ordering.
# This is stricter than rebuilding pairs from edofMat, and remains correct
# even if KE is not exactly symmetric or if iK/jK are customized.
KEv = vec(KE)
rows = Int[]
cols = Int[]
asmelem = Int[]
asmval = Float64[]
sizehint!(rows, 36 * nele)
sizehint!(cols, 36 * nele)
sizehint!(asmelem, 36 * nele)
sizehint!(asmval, 36 * nele)
@inbounds for q in eachindex(iK)
row_full = iK[q]
col_full = jK[q]
row_free = freeid[row_full]
col_free = freeid[col_full]
(row_free == 0 || col_free == 0) && continue
# Same triangle used by Symmetric(K[freedofs,freedofs]) with default :U.
row_free <= col_free || continue
local_id = (q - 1) % 64 + 1
elem_id = (q - 1) ÷ 64 + 1
push!(rows, row_free)
push!(cols, col_free)
push!(asmelem, elem_id)
push!(asmval, KEv[local_id])
end
Kff = sparse(rows, cols, ones(Float64, length(rows)), nfree, nfree)
asmpos = Vector{Int}(undef, length(rows))
@inbounds for q in eachindex(rows)
asmpos[q] = _find_csc_position(Kff, rows[q], cols[q])
end
fill!(Kff.nzval, 0.0)
return Kff, asmpos, asmelem, asmval
end
@inline function assemble_free_stiffness!(
Kff::SparseMatrixCSC{Float64,Int},
asmpos::Vector{Int},
asmelem::Vector{Int},
asmval::Vector{Float64},
Emod::Vector{Float64},
)
fill!(Kff.nzval, 0.0)
@inbounds for q in eachindex(asmpos)
Kff.nzval[asmpos[q]] += asmval[q] * Emod[asmelem[q]]
end
return Kff
end
function check_free_stiffness_cache(
freedofs::AbstractVector{Int},
iK::AbstractVector{Int},
jK::AbstractVector{Int},
KE::AbstractMatrix{Float64},
Emod::AbstractVector{Float64},
Kff::SparseMatrixCSC{Float64,Int},
asmpos::Vector{Int},
asmelem::Vector{Int},
asmval::Vector{Float64},
ndofs::Int,
)
# Debug helper, not used in the optimized loop.
# Returns 0.0 up to floating-point roundoff if the optimized assembly matches
# the old sparse(iK,jK,sK) + extraction path.
sK_ref = reshape(vec(KE) * reshape(collect(Emod), 1, :), length(iK))
K_ref = sparse(iK, jK, sK_ref, ndofs, ndofs)
assemble_free_stiffness!(Kff, asmpos, asmelem, asmval, collect(Emod))
return norm(Matrix(Symmetric(K_ref[freedofs, freedofs])) - Matrix(Symmetric(Kff, :U)), Inf)
end
@inline function update_material!(
Emod::Vector{Float64},
xPhysv::AbstractVector{Float64},
penal::Float64,
E0::Float64,
Emin::Float64,
)
dE = E0 - Emin
@inbounds @simd for e in eachindex(Emod)
xp = xPhysv[e]
Emod[e] = Emin + xp^penal * dE
end
return nothing
end
function refactorize_cholesky!(old_factor, Kff::SparseMatrixCSC{Float64,Int})
A = Symmetric(Kff, :U)
if old_factor === nothing
return cholesky(A; check=false)
end
# Julia's CHOLMOD interface supports reusing a symbolic factorization on many versions.
# If not available locally, this falls back to a fresh factorization with the same matrix.
try
return cholesky!(old_factor, A; check=false)
catch
return cholesky(A; check=false)
end
end
function solve_free_displacements!(
U::Vector{Float64},
Ufree::Vector{Float64},
Ffree::AbstractVector{Float64},
freedofs::AbstractVector{Int},
Kfact,
)
ldiv!(Ufree, Kfact, Ffree)
@inbounds @simd for k in eachindex(freedofs)
U[freedofs[k]] = Ufree[k]
end
return nothing
end
# -----------------------------------------------------------------------------
# Element objective/sensitivity without U[edofMat] allocation
# -----------------------------------------------------------------------------
function element_objective_sensitivities!(
ce::AbstractVector{Float64},
dc::AbstractVector{Float64},
dv::AbstractVector{Float64},
U::AbstractVector{Float64},
edofMat::AbstractMatrix{Int},
KE::AbstractMatrix{Float64},
xPhysv::AbstractVector{Float64},
Emod::AbstractVector{Float64},
penal::Float64,
E0::Float64,
Emin::Float64,
)
dE = E0 - Emin
c = 0.0
nele = length(ce)
@inbounds for e in 1:nele
energy = 0.0
for i in 1:8
ui = U[edofMat[e, i]]
kui = 0.0
for j in 1:8
kui += KE[i, j] * U[edofMat[e, j]]
end
energy += ui * kui
end
xp = xPhysv[e]
ce[e] = energy
c += Emod[e] * energy
dc[e] = -penal * dE * xp^(penal - 1.0) * energy
dv[e] = 1.0
end
return c
end
# -----------------------------------------------------------------------------
# Filters without temporary vectors from broadcasted H * ... expressions
# -----------------------------------------------------------------------------
function apply_filter!(
dc::AbstractVector{Float64},
dv::AbstractVector{Float64},
x::AbstractVector{Float64},
H::SparseMatrixCSC{Float64,Int},
Hs::AbstractVector{Float64},
work1::Vector{Float64},
work2::Vector{Float64},
ft::Int,
)
n = length(dc)
if ft == 1
@inbounds @simd for e in 1:n
work1[e] = x[e] * dc[e]
end
mul!(work2, H, work1)
@inbounds @simd for e in 1:n
dc[e] = work2[e] / Hs[e] / max(1.0e-3, x[e])
end
elseif ft == 2
@inbounds @simd for e in 1:n
work1[e] = dc[e] / Hs[e]
end
mul!(work2, H, work1)
copyto!(dc, work2)
@inbounds @simd for e in 1:n
work1[e] = dv[e] / Hs[e]
end
mul!(work2, H, work1)
copyto!(dv, work2)
end
return nothing
end
# -----------------------------------------------------------------------------
# OC update without broadcast allocations
# -----------------------------------------------------------------------------
function optimalityCriteria_fast!(
x::AbstractVector{Float64},
xPhys::AbstractVector{Float64},
xnew::AbstractVector{Float64},
dc::AbstractVector{Float64},
dv::AbstractVector{Float64},
H::SparseMatrixCSC{Float64,Int},
Hs::AbstractVector{Float64},
work::Vector{Float64},
ft::Int,
volfrac::Float64,
)
l1 = 0.0
l2 = 1.0e9
move = 0.2
n = length(x)
while (l2 - l1) / (l1 + l2) > 1.0e-3
lmid = 0.5 * (l2 + l1)
@inbounds for e in 1:n
xe = x[e]
cand = xe * sqrt(-dc[e] / dv[e] / lmid)
xnew[e] = max(0.0, max(xe - move, min(1.0, min(xe + move, cand))))
end
if ft == 1
copyto!(xPhys, xnew)
elseif ft == 2
mul!(work, H, xnew)
@inbounds @simd for e in 1:n
xPhys[e] = work[e] / Hs[e]
end
end
sx = 0.0
@inbounds @simd for e in 1:n
sx += xPhys[e]
end
if sx > volfrac * n
l1 = lmid
else
l2 = lmid
end
end
return nothing
end
# -----------------------------------------------------------------------------
# Optimized optimize!
# -----------------------------------------------------------------------------
function optimize!(
F::AbstractVector{Float64},
U::Vector{Float64},
freedofs::AbstractVector{Int},
penal::Float64,
E0::Float64,
Emin::Float64,
ft::Int,
nelx::Int,
nely::Int,
iK::AbstractVector{Int}, # kept for API compatibility; no longer used
jK::AbstractVector{Int}, # kept for API compatibility; no longer used
edofMat::AbstractMatrix{Int},
KE::AbstractMatrix{Float64},
H::SparseMatrixCSC{Float64,Int},
Hs::AbstractArray{Float64},
volfrac::Float64,
alg::String,
)::Tuple{Matrix{Float64},Matrix{Float64}}
n = nelx * nely
ndofs = length(U)
nfree = length(freedofs)
@assert length(F) == nfree
x = fill(volfrac, nely, nelx)
xPhys = copy(x)
xnew = similar(x)
xv = vec(x)
xPhysv = vec(xPhys)
xnewv = vec(xnew)
Hsv = vec(Hs)
ce = Vector{Float64}(undef, n)
dc = Vector{Float64}(undef, n)
dv = Vector{Float64}(undef, n)
Emod = Vector{Float64}(undef, n)
work1 = Vector{Float64}(undef, n)
work2 = Vector{Float64}(undef, n)
Ufree = Vector{Float64}(undef, nfree)
Kff, asmpos, asmelem, asmval = build_free_stiffness_cache(freedofs, iK, jK, KE, n, ndofs)
Kfact = nothing
if alg == "MMA"
m = 1
xold1 = copy(xv)
xold2 = copy(xv)
xmin = zeros(Float64, n)
xmax = ones(Float64, n)
low = zeros(Float64, n)
upp = ones(Float64, n)
cc = fill(1000.0, m)
dd = ones(Float64, m)
aa0 = 1.0
aa = zeros(Float64, m)
move = 0.2
f0 = 1.0
df0dx = Vector{Float64}(undef, n)
fval = Vector{Float64}(undef, m)
dfdx = Matrix{Float64}(undef, m, n)
end
maxoutit = 1000
kkttol = 5.0e-5
kktnorm = kkttol + 10.0
loop = 0
change = 1.0
fill!(U, 0.0)
while kktnorm > kkttol && change > 1.0e-2 && loop < maxoutit
loop += 1
# FE analysis: update values in the existing sparse pattern, refactorize, solve.
update_material!(Emod, xPhysv, penal, E0, Emin)
assemble_free_stiffness!(Kff, asmpos, asmelem, asmval, Emod)
Kfact = refactorize_cholesky!(Kfact, Kff)
solve_free_displacements!(U, Ufree, F, freedofs, Kfact)
# Objective and sensitivities.
c = element_objective_sensitivities!(ce, dc, dv, U, edofMat, KE, xPhysv, Emod, penal, E0, Emin)
# Filtering.
apply_filter!(dc, dv, xv, H, Hsv, work1, work2, ft)
if alg == "OC"
optimalityCriteria_fast!(xv, xPhysv, xnewv, dc, dv, H, Hsv, work1, ft, volfrac)
change = 0.0
@inbounds @simd for e in 1:n
change = max(change, abs(xnewv[e] - xv[e]))
end
copyto!(xv, xnewv)
elseif alg == "MMA"
@inbounds @simd for e in 1:n
xe = xv[e]
xmin[e] = max(xe - move, 0.0)
xmax[e] = min(xe + move, 1.0)
end
if loop == 1
f0 = c
end
f0val = c / f0
@inbounds @simd for e in 1:n
df0dx[e] = dc[e] / f0
dfdx[1, e] = dv[e] / (n * volfrac)
end
sxphys = 0.0
@inbounds @simd for e in 1:n
sxphys += xPhysv[e]
end
fval[1] = sxphys / (n * volfrac) - 1.0
xmma, ymma, zmma, lam, xsi, eta, mu, zet, s, low, upp =
mmasub(m, n, loop, xv, xmin, xmax, xold1, xold2,
f0val, df0dx, fval, dfdx, low, upp, aa0, aa, cc, dd)
copyto!(xold2, xold1)
copyto!(xold1, xv)
if ft == 1
copyto!(xPhysv, xmma)
elseif ft == 2
mul!(work1, H, xmma)
@inbounds @simd for e in 1:n
xPhysv[e] = work1[e] / Hsv[e]
end
end
change = 0.0
@inbounds @simd for e in 1:n
change = max(change, abs(xmma[e] - xv[e]))
end
copyto!(xv, xmma)
_, kktnorm, _ = kktcheck(
m, n, xmma, ymma, zmma, lam, xsi, eta, mu, zet, s,
xmin, xmax, df0dx, fval, dfdx, aa0, aa, cc, dd,
)
else
error("Unknown algorithm: $alg. Expected \"OC\" or \"MMA\".")
end
@printf("It: %i Obj.: %.4f Vol: %.3f Ch: %.3f |KKT|: %.4f\n",
loop, c, mean(xPhysv), change, kktnorm)
end
return x, xPhys
end
and for the subsolv :
@inline function _subsolv_update_x_work!(
ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
x, low, upp,
)
@inbounds @simd for j in eachindex(x)
ux = Float64(upp[j]) - x[j]
xl = x[j] - Float64(low[j])
ux_2 = ux * ux
xl_2 = xl * xl
ux1[j] = ux
xl1[j] = xl
ux2[j] = ux_2
xl2[j] = xl_2
ux3[j] = ux_2 * ux
xl3[j] = xl_2 * xl
uxinv1[j] = 1.0 / ux
xlinv1[j] = 1.0 / xl
uxinv2[j] = 1.0 / ux_2
xlinv2[j] = 1.0 / xl_2
end
return nothing
end
@inline function _subsolv_add_sparse_scaled_cols!(GG, A::SparseMatrixCSC, scale, sgn::Float64)
@inbounds for j in 1:size(A, 2)
sc = sgn * scale[j]
for p in A.colptr[j]:(A.colptr[j + 1] - 1)
GG[A.rowval[p], j] += Float64(A.nzval[p]) * sc
end
end
return nothing
end
function _subsolv_fill_GG!(GG, P::SparseMatrixCSC, Q::SparseMatrixCSC, uxinv2, xlinv2)
fill!(GG, 0.0)
_subsolv_add_sparse_scaled_cols!(GG, P, uxinv2, 1.0)
_subsolv_add_sparse_scaled_cols!(GG, Q, xlinv2, -1.0)
return nothing
end
function _subsolv_fill_GG!(GG, P::SparseMatrixCSC, Q::AbstractMatrix, uxinv2, xlinv2)
m, n = size(GG)
@inbounds for j in 1:n
scq = xlinv2[j]
for i in 1:m
GG[i, j] = -Float64(Q[i, j]) * scq
end
end
_subsolv_add_sparse_scaled_cols!(GG, P, uxinv2, 1.0)
return nothing
end
function _subsolv_fill_GG!(GG, P::AbstractMatrix, Q::SparseMatrixCSC, uxinv2, xlinv2)
m, n = size(GG)
@inbounds for j in 1:n
scp = uxinv2[j]
for i in 1:m
GG[i, j] = Float64(P[i, j]) * scp
end
end
_subsolv_add_sparse_scaled_cols!(GG, Q, xlinv2, -1.0)
return nothing
end
function _subsolv_fill_GG!(GG, P::AbstractMatrix, Q::AbstractMatrix, uxinv2, xlinv2)
m, n = size(GG)
@inbounds for j in 1:n
scp = uxinv2[j]
scq = xlinv2[j]
for i in 1:m
GG[i, j] = Float64(P[i, j]) * scp - Float64(Q[i, j]) * scq
end
end
return nothing
end
function _subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)
mul!(plam, transpose(P), lam)
mul!(qlam, transpose(Q), lam)
@inbounds @simd for j in eachindex(plam)
plam[j] += Float64(p0[j])
qlam[j] += Float64(q0[j])
end
mul!(gvec, P, uxinv1)
mul!(tmpm, Q, xlinv1)
@inbounds @simd for i in eachindex(gvec)
gvec[i] += tmpm[i]
end
return nothing
end
function _subsolv_residual!(
ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
plam, qlam, gvec, tmpm,
x, y, z::Float64, lam, xsi, eta, mu, zet::Float64, s,
epsi::Float64,
low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
)
_subsolv_update_x_work!(ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2, x, low, upp)
_subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)
norm2 = 0.0
maxabs = 0.0
@inbounds @simd for j in eachindex(x)
rex = plam[j] / ux2[j] - qlam[j] / xl2[j] - xsi[j] + eta[j]
rexsi = xsi[j] * (x[j] - Float64(alfa[j])) - epsi
reeta = eta[j] * (Float64(beta[j]) - x[j]) - epsi
norm2 += rex * rex + rexsi * rexsi + reeta * reeta
maxabs = max(maxabs, abs(rex), abs(rexsi), abs(reeta))
end
adotlam = 0.0
@inbounds @simd for i in eachindex(lam)
adotlam += Float64(a[i]) * lam[i]
end
rez = Float64(a0) - zet - adotlam
rezet = zet * z - epsi
norm2 += rez * rez + rezet * rezet
maxabs = max(maxabs, abs(rez), abs(rezet))
@inbounds @simd for i in eachindex(y)
rey = Float64(c[i]) + Float64(d[i]) * y[i] - mu[i] - lam[i]
relam = gvec[i] - Float64(a[i]) * z - y[i] + s[i] - Float64(b[i])
remu = mu[i] * y[i] - epsi
res = lam[i] * s[i] - epsi
norm2 += rey * rey + relam * relam + remu * remu + res * res
maxabs = max(maxabs, abs(rey), abs(relam), abs(remu), abs(res))
end
return sqrt(norm2), maxabs
end
function subsolv(m, n, epsimin, low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d)
# Primal and dual variables.
x = Vector{Float64}(undef, n)
y = ones(Float64, m)
z = 1.0
lam = ones(Float64, m)
xsi = Vector{Float64}(undef, n)
eta = Vector{Float64}(undef, n)
mu = Vector{Float64}(undef, m)
zet = 1.0
s = ones(Float64, m)
@inbounds @simd for j in 1:n
x[j] = 0.5 * (Float64(alfa[j]) + Float64(beta[j]))
xsi[j] = max(1.0 / (x[j] - Float64(alfa[j])), 1.0)
eta[j] = max(1.0 / (Float64(beta[j]) - x[j]), 1.0)
end
@inbounds @simd for i in 1:m
mu[i] = max(1.0, 0.5 * Float64(c[i]))
end
# n-work arrays.
ux1 = Vector{Float64}(undef, n)
xl1 = Vector{Float64}(undef, n)
ux2 = Vector{Float64}(undef, n)
xl2 = Vector{Float64}(undef, n)
ux3 = Vector{Float64}(undef, n)
xl3 = Vector{Float64}(undef, n)
uxinv1 = Vector{Float64}(undef, n)
xlinv1 = Vector{Float64}(undef, n)
uxinv2 = Vector{Float64}(undef, n)
xlinv2 = Vector{Float64}(undef, n)
plam = Vector{Float64}(undef, n)
qlam = Vector{Float64}(undef, n)
delx = Vector{Float64}(undef, n)
diagx = Vector{Float64}(undef, n)
diagxinv = Vector{Float64}(undef, n)
dx = Vector{Float64}(undef, n)
dxsi = Vector{Float64}(undef, n)
deta = Vector{Float64}(undef, n)
tmpn = Vector{Float64}(undef, n)
# m-work arrays.
gvec = Vector{Float64}(undef, m)
tmpm = Vector{Float64}(undef, m)
dely = Vector{Float64}(undef, m)
dellam = Vector{Float64}(undef, m)
diagy = Vector{Float64}(undef, m)
diagyinv = Vector{Float64}(undef, m)
diaglamyi = Vector{Float64}(undef, m)
dlam = Vector{Float64}(undef, m)
dy = Vector{Float64}(undef, m)
dmu = Vector{Float64}(undef, m)
ds = Vector{Float64}(undef, m)
# Old values for line search.
xold = similar(x)
yold = similar(y)
lamold = similar(lam)
xsiold = similar(xsi)
etaold = similar(eta)
muold = similar(mu)
sold = similar(s)
# GG is dense because the Schur complements become dense anyway.
GG = Matrix{Float64}(undef, m, n)
# Linear-system workspaces. Only one branch is active for a given call.
if m < n
Alam = Matrix{Float64}(undef, m, m)
AA = Matrix{Float64}(undef, m + 1, m + 1)
bb = Vector{Float64}(undef, m + 1)
blam = Vector{Float64}(undef, m)
else
Axx = Matrix{Float64}(undef, n, n)
AA = Matrix{Float64}(undef, n + 1, n + 1)
bb = Vector{Float64}(undef, n + 1)
axz = Vector{Float64}(undef, n)
bx = Vector{Float64}(undef, n)
dellamyi = Vector{Float64}(undef, m)
diaglamyiinv = Vector{Float64}(undef, m)
end
epsi = 1.0
itera = 0
while epsi > Float64(epsimin)
residunorm, residumax = _subsolv_residual!(
ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
plam, qlam, gvec, tmpm,
x, y, z, lam, xsi, eta, mu, zet, s,
epsi,
low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
)
ittt = 0
while residumax > 0.9 * epsi && ittt < 200
ittt += 1
itera += 1
_subsolv_update_x_work!(ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2, x, low, upp)
_subsolv_compute_products!(plam, qlam, gvec, tmpm, P, Q, p0, q0, lam, uxinv1, xlinv1)
_subsolv_fill_GG!(GG, P, Q, uxinv2, xlinv2)
@inbounds @simd for j in 1:n
inva = 1.0 / (x[j] - Float64(alfa[j]))
invb = 1.0 / (Float64(beta[j]) - x[j])
dpsidx = plam[j] / ux2[j] - qlam[j] / xl2[j]
delx[j] = dpsidx - epsi * inva + epsi * invb
diagx[j] = 2.0 * (plam[j] / ux3[j] + qlam[j] / xl3[j]) + xsi[j] * inva + eta[j] * invb
diagxinv[j] = 1.0 / diagx[j]
end
adotlam = 0.0
@inbounds @simd for i in 1:m
adotlam += Float64(a[i]) * lam[i]
end
delz = Float64(a0) - adotlam - epsi / z
@inbounds @simd for i in 1:m
dely[i] = Float64(c[i]) + Float64(d[i]) * y[i] - lam[i] - epsi / y[i]
dellam[i] = gvec[i] - Float64(a[i]) * z - y[i] - Float64(b[i]) + epsi / lam[i]
diagy[i] = Float64(d[i]) + mu[i] / y[i]
diagyinv[i] = 1.0 / diagy[i]
diaglamyi[i] = s[i] / lam[i] + diagyinv[i]
end
dz = 0.0
if m < n
# blam = dellam + dely ./ diagy - GG * (delx ./ diagx)
@inbounds @simd for j in 1:n
tmpn[j] = delx[j] * diagxinv[j]
end
mul!(blam, GG, tmpn)
@inbounds @simd for i in 1:m
blam[i] = dellam[i] + dely[i] * diagyinv[i] - blam[i]
end
# Alam = Diagonal(diaglamyi) + GG * Diagonal(diagxinv) * GG'
fill!(Alam, 0.0)
@inbounds for j in 1:n
wj = diagxinv[j]
for i in 1:m
gij = GG[i, j]
gijw = gij * wj
for k in i:m
Alam[i, k] += gijw * GG[k, j]
end
end
end
@inbounds for i in 1:m
Alam[i, i] += diaglamyi[i]
for k in (i + 1):m
Alam[k, i] = Alam[i, k]
end
end
@inbounds for i in 1:m
for k in 1:m
AA[i, k] = Alam[i, k]
end
AA[i, m + 1] = Float64(a[i])
AA[m + 1, i] = Float64(a[i])
bb[i] = blam[i]
end
AA[m + 1, m + 1] = -zet / z
bb[m + 1] = delz
ldiv!(lu!(AA), bb)
@inbounds @simd for i in 1:m
dlam[i] = bb[i]
end
dz = bb[m + 1]
mul!(tmpn, transpose(GG), dlam)
@inbounds @simd for j in 1:n
dx[j] = -(delx[j] + tmpn[j]) * diagxinv[j]
end
else
# Axx = Diagonal(diagx) + GG' * Diagonal(1 ./ diaglamyi) * GG
@inbounds @simd for i in 1:m
diaglamyiinv[i] = 1.0 / diaglamyi[i]
dellamyi[i] = dellam[i] + dely[i] * diagyinv[i]
end
fill!(Axx, 0.0)
@inbounds for j in 1:n
Axx[j, j] = diagx[j]
end
@inbounds for i in 1:m
wi = diaglamyiinv[i]
for j in 1:n
gij = GG[i, j]
gijw = gij * wi
for k in j:n
Axx[j, k] += gijw * GG[i, k]
end
end
end
@inbounds for j in 1:n
for k in (j + 1):n
Axx[k, j] = Axx[j, k]
end
end
azz = zet / z
bz = delz
@inbounds @simd for i in 1:m
ai = Float64(a[i])
wi = diaglamyiinv[i]
azz += ai * ai * wi
bz -= ai * dellamyi[i] * wi
end
@inbounds for j in 1:n
sx = 0.0
sb = 0.0
for i in 1:m
gij = GG[i, j]
wi = diaglamyiinv[i]
sx += gij * Float64(a[i]) * wi
sb += gij * dellamyi[i] * wi
end
axz[j] = -sx
bx[j] = delx[j] + sb
end
@inbounds for j in 1:n
for k in 1:n
AA[j, k] = Axx[j, k]
end
AA[j, n + 1] = axz[j]
AA[n + 1, j] = axz[j]
bb[j] = -bx[j]
end
AA[n + 1, n + 1] = azz
bb[n + 1] = -bz
ldiv!(lu!(AA), bb)
@inbounds @simd for j in 1:n
dx[j] = bb[j]
end
dz = bb[n + 1]
mul!(tmpm, GG, dx)
@inbounds @simd for i in 1:m
dlam[i] = (tmpm[i] - dz * Float64(a[i]) + dellamyi[i]) * diaglamyiinv[i]
end
end
@inbounds @simd for i in 1:m
dy[i] = (-dely[i] + dlam[i]) * diagyinv[i]
dmu[i] = -mu[i] + epsi / y[i] - mu[i] * dy[i] / y[i]
ds[i] = -s[i] + epsi / lam[i] - s[i] * dlam[i] / lam[i]
end
dzet = -zet + epsi / z - zet * dz / z
stmxx = max(-1.01 * dz / z, -1.01 * dzet / zet)
stmalfa = -Inf
stmbeta = -Inf
@inbounds @simd for i in 1:m
stmxx = max(
stmxx,
-1.01 * dy[i] / y[i],
-1.01 * dlam[i] / lam[i],
-1.01 * dmu[i] / mu[i],
-1.01 * ds[i] / s[i],
)
end
@inbounds @simd for j in 1:n
inva = 1.0 / (x[j] - Float64(alfa[j]))
invb = 1.0 / (Float64(beta[j]) - x[j])
dxsi[j] = -xsi[j] + epsi * inva - xsi[j] * dx[j] * inva
deta[j] = -eta[j] + epsi * invb + eta[j] * dx[j] * invb
stmxx = max(stmxx, -1.01 * dxsi[j] / xsi[j], -1.01 * deta[j] / eta[j])
stmalfa = max(stmalfa, -1.01 * dx[j] * inva)
stmbeta = max(stmbeta, 1.01 * dx[j] * invb)
end
steg = 1.0 / max(max(max(stmalfa, stmbeta), stmxx), 1.0)
copyto!(xold, x)
copyto!(yold, y)
zold = z
copyto!(lamold, lam)
copyto!(xsiold, xsi)
copyto!(etaold, eta)
copyto!(muold, mu)
zetold = zet
copyto!(sold, s)
itto = 0
resinew = 2.0 * residunorm
resimaxnew = residumax
while resinew > residunorm && itto < 50
itto += 1
@inbounds @simd for j in 1:n
x[j] = xold[j] + steg * dx[j]
xsi[j] = xsiold[j] + steg * dxsi[j]
eta[j] = etaold[j] + steg * deta[j]
end
@inbounds @simd for i in 1:m
y[i] = yold[i] + steg * dy[i]
lam[i] = lamold[i] + steg * dlam[i]
mu[i] = muold[i] + steg * dmu[i]
s[i] = sold[i] + steg * ds[i]
end
z = zold + steg * dz
zet = zetold + steg * dzet
resinew, resimaxnew = _subsolv_residual!(
ux1, xl1, ux2, xl2, ux3, xl3, uxinv1, xlinv1, uxinv2, xlinv2,
plam, qlam, gvec, tmpm,
x, y, z, lam, xsi, eta, mu, zet, s,
epsi,
low, upp, alfa, beta, p0, q0, P, Q, a0, a, b, c, d,
)
steg *= 0.5
end
residunorm = resinew
residumax = resimaxnew
end
if ittt > 198
println("epsi: ", epsi)
println("ittt: ", ittt)
end
epsi *= 0.1
end
return x, y, z, lam, xsi, eta, mu, zet, s
end
but the code is no more matlab like but more c like