KrylovKit eigsolve of LinearMap giving different values on CPU and GPU

I am trying to compare the eigsolve function on the CPU and on the GPU with the following code:

using CUDA
using CUDA.CUSPARSE
using LinearAlgebra
using LinearMaps
using SparseArrays
using Random
using KrylovKit

function eign_cpu(M::Union{AbstractMatrix,LinearMap{T}}, v0::Vector{T}) where {T<:Number}
    tol = 1e-10
    (E, V, info) = eigsolve(M, v0, 1, :SR; ishermitian=true, tol=tol)
    return (E[1], V[1], info)
end

function eign_gpu(M::Union{CuArray,LinearMap{T}}, v0::CuVector{T}) where {T<:Number}
    tol = 1e-10
    (E, V, info) = eigsolve(M, v0, 1, :SR; ishermitian=true, tol=tol)
    return (E[1], V[1], info)
end

Random.seed!(0)

n = 4
m = 4
Es = [1, 1, 2, 3]
Ed = [2, 3, 4, 4]

w = ones(4)
sw = sqrt.(w)
sw_d = CuArray(sw)

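# Rank-one operator: out = -dot(sw, y) * sw, defined for the CPU and mirrored for the GPU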
C = LinearMap{Float64}((out::AbstractArray, y::AbstractArray) -> (out .= sw; out .*= -dot(sw, y)), n, ismutating=true, issymmetric=true)
C_d = LinearMap{Float64}((out::CuArray, y::CuArray) -> (out .= sw_d; out .*= -dot(sw_d, y)), n, ismutating=true, issymmetric=true)

M = sparse(Es, Ed, ones(Float64, m), n, n)
M_d = CuSparseMatrixCSC(M)

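# Write p ./ 2 into the stored non-zeros and return the Symmetric (upper-triangle) view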
adj_cpu = (p::AbstractArray) -> (M.nzval .= p ./ 2; Symmetric(M))
adj_gpu = (p::CuArray) -> (M_d.nzVal .= p ./ 2; Symmetric(M_d))

p = rand(m)
p_d = CuArray(p)

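# Starting vector: eig_grad plus a small random perturbation, copied to the GPU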
aux = Vector{Float64}(undef, n)
eig_grad = randn(Float64, n)
v0 = ((randn!(aux) .*= 1e-4) .+= eig_grad)
v0_d = CuArray(v0)

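# Sanity checks: the CPU and GPU inputs should agree before comparing the solvers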
if isapprox(v0, Array(v0_d))
    println("v and v0 are approximate")
end
if isapprox(sw, Array(sw_d))
    println("C and C_d are approximate")
end
if isapprox(M, SparseMatrixCSC(M_d))
    println("M and M_d are approximate")
end
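
# Assemble S = C + Symmetric(M) and solve for the smallest (:SR) eigenpair on each device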
S = C + adj_cpu(p)
(E_cpu, V_cpu, info_cpu) = eign_cpu(S, v0)
println("info cpu = ", info_cpu)
S_d = C_d + adj_gpu(p_d)
(E_gpu, V_gpu, info_gpu) = eign_gpu(S_d, v0_d)
println("info gpu = ", info_gpu)

if !isapprox(V_cpu, Array(V_gpu))
    println("WARN: diverging eigenvalues between cpu and gpu:\n",
        "E_cpu = ", E_cpu, " V_cpu = ", V_cpu, "\nE_gpu = ", E_gpu, " V_gpu = ", V_gpu)
    readline()
end

The isapprox checks for the input data all return true. However, the output is:

info cpu = ConvergenceInfo: 4 converged values after 1 iterations and 4 applications of the linear map;
norms of residuals are given by (1.3683491030815672e-36, 6.545806401840613e-35, 6.344902430327517e-34, 8.827885896605726e-33).

info gpu = ConvergenceInfo: 4 converged values after 1 iterations and 4 applications of the linear map;
norms of residuals are given by (7.776628566574466e-20, 1.087879714604506e-17, 4.976482876540383e-17, 3.382716406082749e-17).

WARN: different eigenvalues between cpu and gpu:
E_cpu = -3.6566857184706016 V_cpu = [-0.5154937434198816, -0.46021338805521755, -0.5355038728938648, -0.485495046386023]
E_gpu = -3.7700145739756743 V_gpu = [0.48275625184457943, 0.47793647243109605, 0.52265534023327, 0.5151257370300364]

I expected the results to be approximately equal.
Am I missing something in this experiment? Some wrong assumption, maybe?

Just to be sure, did you check that S and S_d are the same?
Is there a guarantee that M.nzval and M_d.nzVal refer to the same non-zero elements, in the same order?
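As a minimal sketch (reusing S, S_d, and n from the script above), one way to check this is to apply both operators to the same random vector and compare the results:

# Apply the CPU and GPU operators to the same vector; they should agree if S and S_d are the same.
x = randn(n)
y_cpu = S * x
y_gpu = Array(S_d * CuArray(x))
println("operators agree: ", isapprox(y_cpu, y_gpu))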

When running:

println("M = ", M)
println("M_d = ", SparseMatrixCSC(M_d))

I get

M = sparse([1, 1, 2, 3], [2, 3, 4, 4], [1.0, 1.0, 1.0, 1.0], 4, 4)
M_d = sparse(Int32[1, 1, 2, 3], Int32[2, 3, 4, 4], [1.0, 1.0, 1.0, 1.0], 4, 4)

So the only difference is the type of the indices. Do you think that has some influence?
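Just as a sketch, one way to rule out the index type on the CPU side is to rebuild M with Int32 indices and compare the products:

# Same values, Int32 indices; the product should be unchanged.
M32 = SparseMatrixCSC{Float64,Int32}(M)
x = randn(n)
println("index type irrelevant: ", isapprox(Symmetric(M) * x, Symmetric(M32) * x))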

I think you have encountered a bug in the CUDA sparse array implementation: multiplication by the Symmetric(M_d) returned from adj_gpu is incomplete.
There is an open issue for this in CUDA.jl.
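A minimal sketch to isolate that (again reusing M, M_d, and n from above) is to compare the Symmetric-wrapped products directly; if they disagree, a possible workaround is to materialize the full symmetric matrix on the CPU before transferring it, at the cost of rebuilding it whenever p changes.

# Probe the suspected bug: the CPU product uses both triangles of the symmetric matrix.
x = randn(n)
y_cpu = Symmetric(M) * x
y_gpu = Array(Symmetric(M_d) * CuArray(x))
println("Symmetric product matches: ", isapprox(y_cpu, y_gpu))

# Possible workaround: build the full symmetric sparse matrix on the CPU, then transfer it,
# so no Symmetric wrapper is needed on the device.
M_full_d = CuSparseMatrixCSC(sparse(Symmetric(M)))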