Gradient of ldiv trace on CUDA

Hello,

I’m trying to differentiate a left matrix division on CuArrays. I actually have the lower-triangular factors l1 and l2 from the Cholesky (LL') factorizations of two matrices M1 and M2 respectively, so M1 = l1*l1' and M2 = l2*l2'.

I made two functions, one that reconstructs the matrices and one that does forward-backward substitution solves.

MWE:

using CUDA, LinearAlgebra, Flux
l1 = tril(rand(5,5)) |> cu
l2 = tril(rand(5,5)) |> cu
function f(L1, L2)
    tr((L2*L2') \ (L1*L1'))
end
g = gradient(Flux.params(l1)) do
    f(l1, l2)
end

throws

ERROR: ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
Stacktrace:
  [1] unsafe_convert(#unused#::Type{Ptr{Float32}}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA C:\Users\dehaybe\.julia\packages\CUDA\Axzxe\src\array.jl:315
  [2] getrf!(A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ LinearAlgebra.LAPACK C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lapack.jl:575
  [3] lu!(A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::RowMaximum; check::Bool)
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lu.jl:81
  [4] lu(A::Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, pivot::RowMaximum; check::Bool)
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lu.jl:279
  [5] lu(A::Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, pivot::RowMaximum) (repeats 2 times)
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lu.jl:278
  [6] \(A::Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, B::Diagonal{Float32, FillArrays.Fill{Float32, 1, Tuple{Base.OneTo{Int64}}}})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\generic.jl:1142
  [7] (::Zygote.var"#766#767"{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})(Z̄::Diagonal{Float32, FillArrays.Fill{Float32, 1, Tuple{Base.OneTo{Int64}}}})
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\lib\array.jl:507
  [8] #3077#back
    @ C:\Users\dehaybe\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:67 [inlined]
  [9] Pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:300 [inlined]
 [10] (::typeof(∂(f)))(Δ::Float32)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface2.jl:0
 [11] Pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:296 [inlined]
 [12] (::Zygote.var"#94#95"{Zygote.Params, typeof(∂(#13)), Zygote.Context})(Δ::Float32)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:357
 [13] gradient(f::Function, args::Zygote.Params)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:76
 [14] top-level scope
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:295

The second function, with the forward-backward substitution written out explicitly:

function f2(L1, L2)
    L1tril = LowerTriangular(L1)
    L2tril = LowerTriangular(L2)
    Y = L2tril \ (L1tril*L1tril')
    X = L2tril' \ Y
    tr(X)
end
g = gradient(Flux.params(l1)) do
    f2(l1,l2)
end

throws

ERROR: ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
Stacktrace:
  [1] unsafe_convert(#unused#::Type{Ptr{Float32}}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA C:\Users\dehaybe\.julia\packages\CUDA\Axzxe\src\array.jl:315
  [2] unsafe_convert(#unused#::Type{Ptr{Float32}}, V::SubArray{Float32, 2, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false})
    @ Base .\subarray.jl:427
  [3] trtrs!(uplo::Char, trans::Char, diag::Char, A::SubArray{Float32, 2, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}, B::SubArray{Float32, 1, Vector{Float32}, Tuple{UnitRange{Int64}}, true})
    @ LinearAlgebra.LAPACK C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lapack.jl:3424
  [4] ldiv!(A::LowerTriangular{Float32, SubArray{Float32, 2, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}, UnitRange{Int64}}, false}}, B::SubArray{Float32, 1, Vector{Float32}, Tuple{UnitRange{Int64}}, true})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\triangular.jl:727
  [5] ldiv!(xA::LowerTriangular{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, b::SparseArrays.SparseVector{Float32, Int64})
    @ SparseArrays C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\SparseArrays\src\sparsevector.jl:1912
  [6] ldiv!(A::LowerTriangular{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, B::SparseArrays.SparseMatrixCSC{Float32, Int64})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\bidiag.jl:744
  [7] \(A::LowerTriangular{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, B::Diagonal{Float32, FillArrays.Fill{Float32, 1, Tuple{Base.OneTo{Int64}}}})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\triangular.jl:1661
  [8] (::Zygote.var"#758#759"{UpperTriangular{Float32, Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}})(Ȳ::Diagonal{Float32, FillArrays.Fill{Float32, 1, Tuple{Base.OneTo{Int64}}}})
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\lib\array.jl:491
  [9] #3049#back
    @ C:\Users\dehaybe\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:67 [inlined]
 [10] Pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:310 [inlined]
 [11] (::typeof(∂(f2)))(Δ::Float32)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface2.jl:0
 [12] Pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:314 [inlined]
 [13] (::Zygote.var"#94#95"{Zygote.Params, typeof(∂(#15)), Zygote.Context})(Δ::Float32)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:357
 [14] gradient(f::Function, args::Zygote.Params)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:76
 [15] top-level scope
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:313

What I actually need is the gradient of tr(inv(M2)*M1). Unfortunately, none of these formulations seem to be differentiable on the GPU: the pullbacks fall back to CPU LAPACK routines (getrf!, trtrs!) that cannot take the address of a CuArray. I’ve been looking for a few days but I’m stuck; it’s quite hard to combine AD and CUDA. Is there a package I may not know of that could help? Thanks!!
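
For context, since M2 = l2*l2' is symmetric, the gradient I need has a closed form: d tr(M2 \ (l1*l1')) / d l1 = 2*(M2 \ l1), i.e. two triangular solves. A quick CPU sanity check against Zygote (a minimal sketch with stand-in names a1, a2 and fcpu; the + I is only there to keep the factor well conditioned):

using LinearAlgebra, Zygote

a1 = tril(rand(5, 5))
a2 = tril(rand(5, 5)) + I            # + I keeps the factor well conditioned

fcpu(L1, L2) = tr((L2*L2') \ (L1*L1'))

g_ad = gradient(x -> fcpu(x, a2), a1)[1]
L2t = LowerTriangular(a2)
g_manual = 2 .* (L2t' \ (L2t \ a1))  # closed form: 2 * (M2 \ l1)
g_ad ≈ g_manual                      # true

The problem is getting the same result through AD on the GPU, where this term sits inside a larger loss.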

Stupid question to get the discussion going: does this work on CPU?

Yes, it does. I changed my initial post because I had forgotten the Flux.params in my MWE… so the error was not the right one. I was in a hurry when I wrote it.

This morning I found a workaround:

function f3(L1, L2)
    U2 = L2'
    M1 = L1*L1'
    L2i = inv(L2)     # dense inverse: the solver can't exploit
    U2i = inv(U2)     # the triangular structure here
    M2i = U2i*L2i     # inv(M2) == inv(U2)*inv(L2)
    X = M2i*M1
    tr(X)
end
g = gradient(Flux.params(l1)) do
    f3(l1, l2)
end

I used the fact that (AB)^-1 == B^-1 A^-1. The issue is that this does two dense matrix inversions, which is very inefficient because inv does not know that L2 and U2 are triangular. Unfortunately, wrapping them in LowerTriangular/UpperTriangular structs does not work:

function f4(l1, l2)
    L1 = l1 |> LowerTriangular
    L2 = l2 |> LowerTriangular
    U2 = l2' |> UpperTriangular
    M1 = L1*L1'
    L2i = inv(L2)
    U2i = inv(U2)
    M2i = U2i*L2i
    X = M2i*M1
    tr(X)
end
g = gradient(Flux.params(l1)) do
    f4(l1, l2)
end

throws

ERROR: ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
Stacktrace:
  [1] unsafe_convert(#unused#::Type{Ptr{Float32}}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ CUDA C:\Users\dehaybe\.julia\packages\CUDA\Axzxe\src\array.jl:315
  [2] trtrs!(uplo::Char, trans::Char, diag::Char, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, B::Matrix{Float32})
    @ LinearAlgebra.LAPACK C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\lapack.jl:3424
  [3] ldiv!(A::UpperTriangular{Float32, Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, B::Matrix{Float32})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\triangular.jl:788
  [4] inv(A::UpperTriangular{Float32, Adjoint{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}})
    @ LinearAlgebra C:\Users\dehaybe\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\triangular.jl:809
  [5] adjoint
    @ C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\lib\array.jl:458 [inlined]
  [6] _pullback
    @ C:\Users\dehaybe\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
  [7] _pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:290 [inlined]
  [8] _pullback(::Zygote.Context, ::typeof(f3), ::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface2.jl:0
  [9] _pullback
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:296 [inlined]
 [10] _pullback(::Zygote.Context, ::var"#23#24")
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface2.jl:0
 [11] pullback(f::Function, ps::Zygote.Params)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:352
 [12] gradient(f::Function, args::Zygote.Params)
    @ Zygote C:\Users\dehaybe\.julia\packages\Zygote\cCyLF\src\compiler\interface.jl:75
 [13] top-level scope
    @ c:\Users\dehaybe\OneDrive - UCL\Doctorat\julia\ReinforcementLearning\src\ReinforcementLearningZoo\src\algorithms\policy_gradient\mpo.jl:295

This happens only with the UpperTriangular inverse, and note the B::Matrix{Float32} in frame [3] of the trace: a CPU array shows up inside the pullback. That tells me the lazy Adjoint wrapper is the culprit somehow.
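
One way to see it (a quick illustration on my side): l2' is a lazy Adjoint wrapper rather than a CuArray, and, at least with the versions in the trace above, inv of the adjoint-wrapped triangular ends up in LAPACK’s trtrs! with a CPU Matrix, while permutedims materializes a plain device array first:

using CUDA, LinearAlgebra

l2 = tril(rand(Float32, 5, 5)) |> cu

@show l2' isa Adjoint               # true: a lazy wrapper, not a CuArray
@show permutedims(l2) isa CuArray   # true: a materialized device copy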


Actually, we can use a non-lazy transpose by changing the function to

function f3(l1, l2)
    L1 = l1 |> LowerTriangular
    U1 = permutedims(l1) |> UpperTriangular
    L2 = l2 |> LowerTriangular
    U2 = permutedims(l2) |> UpperTriangular
    M1 = L1*U1
    L2i = inv(L2)
    U2i = inv(U2)
    M2i = U2i*L2i
    X = M2i*M1
    tr(X)
end
g = gradient(Flux.params(l1)) do
    f3(l1, l2)
end

Works. That’s the best I can do, I think.
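
An alternative that avoids the dense inversions entirely (a sketch on my side, not something I have benchmarked): wrap the whole quantity in one function and give it a hand-written ChainRulesCore.rrule, so both the forward pass and the pullback run as triangular solves on the device. It uses the closed forms d tr/d l1 = 2*inv(M2)*l1 and d tr/d l2 = -2*(inv(M2)*M1)*inv(L2'); the name trace_ratio is mine:

using CUDA, LinearAlgebra, ChainRulesCore, Flux, Zygote

# tr(inv(M2)*M1) with M1 = l1*l1' and M2 = l2*l2', via triangular solves only
function trace_ratio(l1, l2)
    L2 = LowerTriangular(l2)
    tr(L2' \ (L2 \ (l1 * l1')))
end

function ChainRulesCore.rrule(::typeof(trace_ratio), l1, l2)
    L2 = LowerTriangular(l2)
    X = L2' \ (L2 \ (l1 * l1'))              # X = inv(M2)*M1
    function trace_ratio_pullback(ȳ)
        dl1 = 2ȳ .* (L2' \ (L2 \ l1))        # 2 * inv(M2) * l1
        dl2 = -2ȳ .* copy((L2 \ copy(X'))')  # -2 * X * inv(L2')
        return NoTangent(), dl1, dl2
    end
    return tr(X), trace_ratio_pullback
end

g = gradient(Flux.params(l1)) do
    trace_ratio(l1, l2)
end

Both solves stay on the GPU, and the problematic pullbacks for \ and inv are never reached.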