I am unable to avoid allocation when performing an in-place solve against a simplical Cholesky factorisation, but have no problems with a supernodal factorisation.
Note that:
- the allocation is small, but since I am performing many millions of solves (as part of an eigenproblem using @stabbles excellent ArnoldiMethod.jl package, this adds up to hundreds of Gb with associated GC overhead;
 - simply avoiding the problem using the supernodal factorisation is suboptimal, since the size and sparsity of the matrices is such that the simplical factorisation provides a much faster solve.
 
I understand that CHOLMOD hooks into Julia’s allocation routines, so I am assuming that the allocations are internal to CHOLMOD, and thus this might have to be a question or upstream. However I thought I would ask here first as I am touching the internals of various stdlibs in order to call the cholmod_solve2 interface without moving between Julia and CHOLMOD representations of the various matrices; there could easily be a Julia side implementation detail that I am missing, perhaps relating to common structure. As such, any thoughts from those who’ve touched the core implementation of cholmod.jl, e.g. @sacha, @andreasnoack  would be appreciated.
The following code demonstrates the problem (much of this is hacked from the SuiteSparse stdlib).
using SparseArrays
using LinearAlgebra
using SuiteSparse
using BenchmarkTools
import SuiteSparse.CHOLMOD: Sparse, Dense, Factor, C_Dense, C_Factor, C_Sparse, SuiteSparse_long
import SuiteSparse.CHOLMOD: common_struct, common_supernodal, common_nmethods, change_factor!, set_print_level, defaults
import SuiteSparse.CHOLMOD: VTypes, @cholmod_name, fact_, cholesky!, CHOLMOD_A
import SparseArrays: getcolptr, getrowval
function supercholesky(A::Sparse; shift::Real=0.0, check::Bool = true,
  perm::Union{Nothing,AbstractVector{SuiteSparse_long}}=nothing)
  
  cm = defaults(common_struct[Threads.threadid()])
  set_print_level(cm, 0)
  
  # Force a supernodal solution (eliminates alloc on solve)
  unsafe_store!(common_supernodal[Threads.threadid()], 2)
  
  # Compute the symbolic factorization
  F = fact_(A, cm; perm = perm)
  
  # Compute the numerical factorization
  cholesky!(F, A; shift = shift, check = check)
  
  return F
  
end
function _solve!(X::Dense{Tv}, Yref::Ref{Ptr{C_Dense{Tv}}}, Eref::Ref{Ptr{C_Dense{Tv}}}, F::Factor{Tv}, B::Dense{Tv}) where Tv<:VTypes
  
  # Pointer to pre-allocated dense matrix
  Xref = Ptr{C_Dense{Tv}}(pointer(X))
  sys = CHOLMOD_A # Solve type
  Bset = C_NULL   # Unused parameter
  Xset = C_NULL   # Unused parameter
  
  if size(F,1) != size(B,1)
    throw(DimensionMismatch("LHS and RHS should have the same number of rows. " *
    "LHS has $(size(F,1)) rows, but RHS has $(size(B,1)) rows."))
  end
  
  if !issuccess(F)
    s = unsafe_load(pointer(F))
    if s.is_ll == 1
      throw(LinearAlgebra.PosDefException(s.minor))
    else
      throw(LinearAlgebra.ZeroPivotException(s.minor))
    end
  end
  
  res = ccall((@cholmod_name("solve2"), :libcholmod), Cint,
  # Input                                                        # Output                                         # Workspace
  (Cint, Ptr{C_Factor{Tv}}, Ptr{C_Dense{Tv}}, Ptr{C_Sparse{Tv}}, Ref{Ptr{C_Dense{Tv}}},  Ref{Ptr{C_Sparse{Tv}}},  Ref{Ptr{C_Dense{Tv}}},  Ref{Ptr{C_Dense{Tv}}}, Ptr{UInt8}),
  sys,   F,                   B,              Bset,              Xref,                   Xset,                    Yref,                   Eref,                  SuiteSparse.CHOLMOD.common_struct[Threads.threadid()])
  
  if(res != 1)
    throw(ErrorException("CHOLMOD solve failure"))
  end
  
  return 
  
end
T = Float64
N = 300
X = Dense(Array{T}(undef, N))
B = Dense(rand(N))
Yref = Ref(Ptr{C_Dense{T}}(C_NULL))
Eref = Ref(Ptr{C_Dense{T}}(C_NULL))
A = sprand(N, N, 0.005)
A = A + A' + N*I
As = Sparse(A)
F_simple = cholesky(A)
F_super =  supercholesky(As);
_solve!(X, Yref, Eref, F_simple, B);
@show F_simple
A\B ≈ X ? println("Simplical pass") : error("Simplical fail")
_solve!(X, Yref, Eref, F_super, B);
@show F_super
A\B ≈ X ? println("Supernodal pass") : error("Supernodal fail")
@benchmark _solve!($X, $Yref, $Eref, $F_simple, $B)
@benchmark _solve!($X, $Yref, $Eref, $F_super, $B)
Simplical result:
@benchmark _solve!($X, $Yref, $Eref, $F_simple, $B)
BenchmarkTools.Trial:
  memory estimate:  9.46 KiB
  allocs estimate:  2
  --------------
  minimum time:     4.200 μs (0.00% GC)
  median time:      4.443 μs (0.00% GC)
  mean time:        4.476 μs (0.72% GC)
  maximum time:     55.157 μs (82.83% GC)
  --------------
  samples:          10000
  evals/sample:     7
Supernodal result:
 @benchmark _solve!($X, $Yref, $Eref, $F_super, $B)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     18.299 μs (0.00% GC)
  median time:      18.800 μs (0.00% GC)
  mean time:        19.150 μs (0.00% GC)
  maximum time:     88.299 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1