How to solve the non linear Coupled Cluster equations using Inexact Newton with DIIS?

I have been trying to solve the Coupled Cluster Amplitude Equations which are a set of non-linear equations using Julia. The aim is to find cluster amplitudes tensor T2 such that the residual tensor (defined below) is zero (or has norm less than a certain tolerance value) :

function calculate_residual(g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,T2,fvv,foo,R2u,R2)
    @tensor begin
        R2u[a_1,a_2,i_1,i_2]= 0.5* g_vvoo[a_1,a_2,i_1,i_2] - g_voov[a_1,i_3,i_1,a_3] * T2[a_2,a_3,i_3,i_2] - g_vovo[a_2,i_3,a_3,i_2] * T2[a_1,a_3,i_1,i_3]- g_vovo[a_2,i_3,a_3,i_1] * T2[a_1,a_3,i_3,i_2]+ 2*g_voov[a_1,i_3,i_1,a_3] * T2[a_2,a_3,i_2,i_3]+ 0.5*g_oooo[i_3,i_4,i_1,i_2] * T2[a_1,a_2,i_3,i_4]+ fvv[a_2,a_3] * T2[a_1,a_3,i_1,i_2]+ 0.5*g_vvvv[a_1,a_2,a_3,a_4] * T2[a_3,a_4,i_1,i_2]- foo[i_3,i_2] * T2[a_1,a_2,i_1,i_3]+ 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_1,i_3]) * T2[a_2,a_4,i_2,i_4]- 2 *(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_1,i_3]) * T2[a_2,a_4,i_4,i_2]+ 0.5*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_3,i_1]) * T2[a_2,a_4,i_4,i_2]- 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_3,i_2]) * T2[a_1,a_2,i_1,i_4]- 1*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_1,i_3]) * T2[a_2,a_3,i_2,i_4]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_1,i_3]) * T2[a_2,a_3,i_4,i_2]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_2,a_4,i_4,i_3]) * T2[a_1,a_3,i_1,i_2]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_2,i_3]) * T2[a_1,a_2,i_1,i_4]- 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_2,a_3,i_4,i_3]) * T2[a_1,a_4,i_1,i_2]+ 0.5*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_3,i_2]) * T2[a_2,a_3,i_4,i_1]+ 0.5* (g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_1,i_2]) * T2[a_1,a_2,i_3,i_4]
     end
     @tensor begin
        R2[a,b,i,j] := R2u[a,b,i,j] + R2u[b,a,j,i]
     end
     return R2
end #Only T2 (a rank 4 tensor) is variable, everything else
# are parameters

The problems I am facing can be categorized into two categories
One - Problems when trying to do it using a Package
I cannot find a package in Julia that can solve this set of equation using inexact newton (and also has the ability to implement DIIS) (where I can give my own Jacobian – as my function maps a rank 4 tensor to a rank 4 tensor, the Jacobian would be a rank 8 tensor!). I tried NLSolve.jl , it seems to incorrectly state that it’s method anderson is same as DIIS or Pulay mixing.

Two - Problems in coding my own iterative solver with DIIS
After not getting an appropriate package, I tried to solve the equations by hand using the following initial solver :

Hand-written Iterative Solver
function ccd_by_hand(maxitr)
    R2u,R2,g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,K,fvv,foo,erhf = initialize_cc_variables();
    R_iter = zeros(typeof(R2), size(R2)) 
    T2_old = initialize_t2_only();
    normtol=1.0e-8
    # display(T2_old) # Using MP2 amplitudes
    e_new = calculate_ECCD(g_oovv,T2_old)
    e_old = copy(0.0)
    earr = []
    etol = 1.0e-6
    p_max = 6
    p_min = 2
    push!(earr,e_new[1]+erhf)
    println("Starting CCD Iterations with Convergence Criteria as ||R₂||<$normtol , ΔE<$etol and max iterations=$maxitr")
    println("-----------------------------------------------------------------------------------------------")
    for i in 1:maxitr
        @printf("Starting Iteration: %d  Current Total Energy: %.8f\n", i, e_new[1]+erhf)
        println("Energy before updating = $(e_old[1]+erhf)")
        R2 = calculate_residual(g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,T2_old,fvv,foo,R2u,R2)
        conv,r2norm = check_convergence(R2,normtol,e_old,e_new,etol)
        if conv
            @printf("CCD Converged in %d iterations, no further updates required,current ||R2||=%.10f and ΔE = %.12f\n\n-------------------------\nE_Total = %.8f\n", i,r2norm,abs(e_old[1]-e_new[1]),e_new[1]+erhf)
            break
        end
        R_iter = calculate_R_iter(R_iter,R2,fvv,foo)
        @printf("CCD Not Converged in %d iterations, current ||R2|| = %.10f and ΔE = %.12f \n", i,r2norm,abs(e_old[1]-e_new[1]))
        push!(earr,e_new[1]+erhf)
        T2_old = update_T2(T2_old,R2,fvv,foo)
        e_old = e_new
        e_new = calculate_ECCD(g_oovv,T2_old)
        println("Energy after updating = $(e_new[1]+erhf)")
        
    end
    
end

The code does work and the solution converges, but as many might know, the Coupled Cluster Equations are very slow to converge and usually need some kind of convergence accelerators. I tried to implement DIIS using the residuals scaled by the elements of foo and fvv operators as the error vectors in the code below

Code with DIIS
using Printf,Plots
include("ccd-helper.jl")
function ccd_by_hand(maxitr)
    R2u,R2,g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,K,fvv,foo,erhf = initialize_cc_variables();
    R_iter = zeros(Float64,size(R2)) 
    T2_old = initialize_t2_only();
    normtol=1.0e-8
    # display(T2_old) # Using MP2 amplitudes
    e_new = calculate_ECCD(g_oovv,T2_old)
    e_old = copy(0.0)
    earr = []
    etol = 1.0e-6
    p_max = 6 # Maximum number of previous iterations to store for DIIS
    p_min = 2 # DIIS will start after this many iterations
    R_iter_storage = Array{typeof(R2)}(undef, 0)
    push!(earr,e_new[1]+erhf)
    println("Starting CCD Iterations with Convergence Criteria as ||R₂||<$normtol , ΔE<$etol and max iterations=$maxitr")
    println("-----------------------------------------------------------------------------------------------")
    for i in 1:maxitr
        @printf("\n-------------------------\nStarting Iteration: %d  Current Total Energy: %.8f\n", i, e_new[1]+erhf)
        println("Energy before updating = $(e_old[1]+erhf)")
        R2 = calculate_residual(g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,T2_old,fvv,foo,R2u,R2)
        conv,r2norm = check_convergence(R2,normtol,e_old,e_new,etol)
        if conv
            @printf("CCD Converged in %d iterations, no further updates required,current ||R2||=%.10f and ΔE = %.12f\n\n-------------------------\nE_Total = %.8f\n", i,r2norm,abs(e_old[1]-e_new[1]),e_new[1]+erhf)
            break
        end
        R_iter = calculate_R_iter(R_iter,R2,fvv,foo)
        # display(R_iter)
        @printf("CCD Not Converged in %d iterations, current ||R2|| = %.10f and ΔE = %.12f \n", i,r2norm,abs(e_old[1]-e_new[1]))

        if i >= p_min   #DIIS 
            println("DIIS is being implemented in iteration $i")
            if i >= p_min + p_max - 1  # Adjusted condition to start popping correctly
                popfirst!(R_iter_storage)
            end
            push!(R_iter_storage, copy(R_iter))
            # R_iter = DIIS_by_hand(R_iter_storage, length(R_iter_storage))
            R_iter = DIIS_by_hand_constrained(R_iter_storage, length(R_iter_storage))
            println("NORM OF R_ITER = $(norm(R_iter))")
        elseif i == p_min - 1
            push!(R_iter_storage, copy(R_iter))
        end
        println(length(R_iter_storage))
        e_old = e_new
        e_new = calculate_ECCD(g_oovv,T2_old)
        T2_old = update_T2_using_R_iter(T2_old,R_iter)
        println("Energy after updating = $(e_new[1]+erhf)\n-------------------------\n")
    end
    return R_iter_storage
    
end
maxitr = 200;
R_iter_storage=ccd_by_hand(maxitr);

However, on implementing this DIIS, the convergence becomes even slower now rather than accelerating and I cannot understand why. I tried to use the same implementation of DIIS as done in Fermi.jl . Below I have also given all the necessary subroutines I had defined in the above functions :

Definitions of user defined functions used in solver
function initialize_t2_only()
    h,g,hnuc,norb,nelec = main("water_dump.fcidump");
    T2 = zeros(size(g))
    f = zeros(size(h)) # fock matrix
    nocc = round(Int,nelec/2);
    nv = norb - nocc;
    erhf = calculate_ERHF(h,g,hnuc,nocc)
######################## Fock Matrix and K - Tensor Initialization ##############################
    for p in 1:norb , q in 1:norb
        s_pq=0
        for i in 1:nocc
            s_pq += 2*g[p,q,i,i]-g[p,i,i,q]
        end
        f[p,q] = h[p,q] + s_pq
    end
    K = zeros(nv,nv,nocc,nocc)
    # Note that K Tensor is indexed as K[a,b,i,j] where a,b are virtual orbitals and i,j are occupied orbitals
    # It has a lower dimensionality than the g tensor, as we dont have transition elements where 
    # the transition is happening from one virtual to another virtual or one occupied to another occupied.
    # In later versions of the code, we will use the g tensor directly to calculate the T2 amplitudes without
    # uselessly allocating an extra tensor.
    for a in 1:nv, b in 1:nv, i in 1:nocc, j in 1:nocc
        K[a,b,i,j] = g[nocc+a,i,nocc+b,j]
    end

##################################################################################
################## Amplitude Initialization ######################################
    T2 = - K 
    for a in 1:nv, b in 1:nv, i in 1:nocc, j in 1:nocc
        T2[a,b,i,j] = T2[a,b,i,j]/(f[nocc+a,nocc+a] + f[nocc+b,nocc+b] - f[i,i] - f[j,j])
        # if abs(T2[a,b,i,j]) < 10e-8
        #     T2[a,b,i,j] = 0.0
        # end
    end

##################################################################################
    return T2
end

function calculate_ECCD(g_oovv,T2)
    @tensoropt begin
        ECCSD[:] := 2*g_oovv[i_1,i_2,a_1,a_2] * T2[a_1,a_2,i_1,i_2] - g_oovv[i_1,i_2,a_1,a_2] * T2[a_1,a_2,i_2,i_1]
    end
    return ECCSD
end

function calculate_residual(g_vvoo,g_voov,g_vovo,g_oovv,g_oooo,g_vvvv,T2,fvv,foo,R2u,R2)
    @tensor begin
        R2u[a_1,a_2,i_1,i_2]= 0.5* g_vvoo[a_1,a_2,i_1,i_2] - g_voov[a_1,i_3,i_1,a_3] * T2[a_2,a_3,i_3,i_2] - g_vovo[a_2,i_3,a_3,i_2] * T2[a_1,a_3,i_1,i_3]- g_vovo[a_2,i_3,a_3,i_1] * T2[a_1,a_3,i_3,i_2]+ 2*g_voov[a_1,i_3,i_1,a_3] * T2[a_2,a_3,i_2,i_3]+ 0.5*g_oooo[i_3,i_4,i_1,i_2] * T2[a_1,a_2,i_3,i_4]+ fvv[a_2,a_3] * T2[a_1,a_3,i_1,i_2]+ 0.5*g_vvvv[a_1,a_2,a_3,a_4] * T2[a_3,a_4,i_1,i_2]- foo[i_3,i_2] * T2[a_1,a_2,i_1,i_3]+ 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_1,i_3]) * T2[a_2,a_4,i_2,i_4]- 2 *(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_1,i_3]) * T2[a_2,a_4,i_4,i_2]+ 0.5*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_3,i_3,i_1]) * T2[a_2,a_4,i_4,i_2]- 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_3,i_2]) * T2[a_1,a_2,i_1,i_4]- 1*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_1,i_3]) * T2[a_2,a_3,i_2,i_4]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_1,i_3]) * T2[a_2,a_3,i_4,i_2]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_2,a_4,i_4,i_3]) * T2[a_1,a_3,i_1,i_2]+ (g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_2,i_3]) * T2[a_1,a_2,i_1,i_4]- 2*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_2,a_3,i_4,i_3]) * T2[a_1,a_4,i_1,i_2]+ 0.5*(g_oovv[i_3,i_4,a_3,a_4] * T2[a_1,a_4,i_3,i_2]) * T2[a_2,a_3,i_4,i_1]+ 0.5* (g_oovv[i_3,i_4,a_3,a_4] * T2[a_3,a_4,i_1,i_2]) * T2[a_1,a_2,i_3,i_4]
     end
     @tensor begin
        R2[a,b,i,j] := R2u[a,b,i,j] + R2u[b,a,j,i]
     end
     return R2
end

function check_convergence(R2,normtol,e_old,e_new,etol)
    r2norm = sqrt(abs(dot(R2,R2)))
    if(r2norm < normtol && abs(e_new[1]-e_old[1])<etol)
        return true,r2norm
    else
        return false,r2norm
    end
        
end

function calculate_R_iter(R_iter,R2,fvv,foo)
    nv = size(fvv)[1]
    nocc = size(foo)[1]
    shiftp = 0.20
    for a in 1:nv, b in 1:nv, i in 1:nocc, j in 1:nocc
        R_iter[a,b,i,j] = R2[a,b,i,j]/(fvv[a,a] + fvv[b,b]-foo[i,i] - foo[j,j]+shiftp)
    end
    return R_iter
end

function DIIS_by_hand_constrained(R_iter_storage,p)
    B = zeros(p+1,p+1)
    Bpp = dot(R_iter_storage[p],R_iter_storage[p])
    for i in 1:p
        for j in 1:i
            B[i,j] = dot(R_iter_storage[i],R_iter_storage[j])/Bpp
            B[j,i] = B[i,j]
        end
    end
    B[p+1, 1:p] .= -1
    B[1:p, p+1] .= -1
    B[p+1, p+1] = 0
    Id = zeros(p+1)
    Id[p+1]= -1
    B = Symmetric(B)
    display(B)
    # prob = LinearProblem(B,Id)
    # sol = solve(prob,LinearSolve.KrylovJL_CG())
    # C = sol.u
    # C = cholesky(B)\Id
    C = B\Id
    pop!(C) # As C[p+1] is the Lagrange multiplier
    s = zeros(size(R_iter_storage[1]))
    for i in 1:p
        s = s + C[i].*R_iter_storage[i]
    end
    println("The sum of coefficients of the DIIS are: ",sum(C))
    return s
end

function update_T2_using_R_iter(T2,R_iter)
    nv = size(T2)[1]
    nocc = size(T2)[3]
    for a in 1:nv, b in 1:nv, i in 1:nocc, j in 1:nocc
        T2[a,b,i,j] = T2[a,b,i,j] - R_iter[a,b,i,j]
    end
    return T2
end

Any suggestions that can help me progress in any of the two ways i'm exploring are really appreciated. 

Due to the lack of any responses, I am using this reply to close this topic. There seems to be no packages available in Julia that can solve a non linear equation using inexact newton + DIIS. So the only way to implement it as of now is to code it by hand which at this point I have already done.