Weird behavior of GraphNeuralNetworks.jl `SGConv` layer within `Flux.withgradient`

First, I am quite new to Julia’s ML ecosystem, so I might have made some silly mistakes.

Recently, I tried to debug a very simple training loop involving GraphNeuralNetworks.jl’s `SGConv` layer,
and found that each epoch took successively longer to run than the previous one.

I tried to condense this into a minimal example that tries out a few variations ("minimal" might be a bit of a stretch, though):


import Flux
import GraphNeuralNetworks as GNN
import CUDA
import MLUtils
import CairoMakie
using Statistics

CUDA.allowscalar(false) # Disable scalar operations for CUDA --> no indexing

"""
Test script for the SGConv layer in the GraphNeuralNetworks.jl package.
"""


"""
Generate a set of community graphs for testing purposes.
"""
function generate_dummy_graphs(n_samples = 100)

    all_graphs = GNN.GNNGraph[]
    
    for i in 1:n_samples

        n_nodes = rand(75:125)

        # random adjacency matrix
        A = Float32.(rand(Float32, n_nodes, n_nodes) .> 0.5) 

        # Create simple dummy node features
        features = ones(Float32, 1, n_nodes)
            
        # Create GNNGraph
        g = GNN.GNNGraph(A, ndata=(x=features,))

        push!(all_graphs, g) # note: A is not symmetrized here
    end
    return all_graphs
end

################################################################################################################
# build data loader; with collate = true, each batch of graphs is collated into a single batched GNNGraph
data_loader = let
    data = generate_dummy_graphs(100000)
    Flux.DataLoader(data, batchsize=100, shuffle=true, collate=true)
end


################################################################################################################
# actual tests
# all CPU here, no CUDA
# test SGConv with the dummy data
let
    times = Float64[]

    thing = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops=true)

    for bg in data_loader

        t = @elapsed Flux.withgradient(thing) do model 
            A = model(bg, bg.ndata.x) 
            return sum(A)
        end
        push!(times, t)
    end

    fig = CairoMakie.Figure(size=(600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title="Differentiation Time per Batch default CPU", xlabel="Batch Index", ylabel="Time (s)")
    CairoMakie.scatterlines!(ax, times, label="Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*quantile(times, 0.99))
    CairoMakie.save("sgconv_cpu_differentiation_time.png", fig)
end



####################################################################################################
# test: behavior on CUDA
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end

    CUDA.device_reset!()
    times = Float64[]
    # as before, create a single layer instance outside the loop
    thing = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops=true) |> Flux.gpu

    for (i, bg) in enumerate(data_loader) # no CuIterator here, since the batches are GNNGraphs rather than plain arrays
        bg_gpu  = bg |> Flux.gpu # move to gpu
        t = CUDA.@elapsed Flux.withgradient(thing) do model 
            A = model(bg_gpu, bg_gpu.ndata.x) 
            return sum(A) 
        end

        if i % 50 == 0 
            CUDA.pool_status()
        end

        push!(times, t)
    end

    fig = CairoMakie.Figure(size=(600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title="Differentiation time per Batch new instance CUDA", xlabel="Batch Index", ylabel="Time (s)")
    CairoMakie.scatterlines!(ax, times, label="Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*quantile(times, 0.99))
    CairoMakie.save("sgconv_cuda_differentiation_time.png", fig)
end

####################################################################################################
# inference only (forward pass, no gradient) on GPU
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end

    CUDA.device_reset!()
    
    times = Float64[]
    
    thing = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops=true) |> Flux.gpu

    for bg in data_loader

        bg_gpu = bg |> Flux.gpu # move to gpu

        t = CUDA.@elapsed sum(thing(bg_gpu, bg_gpu.ndata.x))

        push!(times, t)
    end

    fig = CairoMakie.Figure(size=(600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title="Inference time only", xlabel="Batch Index", ylabel="Time (s)")
    CairoMakie.scatterlines!(ax, times, label="Inference Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*quantile(times, 0.99))
    CairoMakie.save("sgconv_cuda_inference_time.png", fig)
end


####################################################################################################
# try with another GNN layer 
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end

    CUDA.device_reset!()

    times = Float64[]

    thing = GNN.CGConv(1 => 128) |> Flux.gpu

    for bg in data_loader
        bg_gpu = bg |> Flux.gpu # move to gpu

        # note: unlike the tests above, the gradient here is taken w.r.t. the input graph rather than the model parameters
        t = CUDA.@elapsed Flux.gradient(bg -> sum(thing(bg, bg.ndata.x)), bg_gpu)

        push!(times, t)
    end

    fig = CairoMakie.Figure(size=(600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title="Differentiation Time per Batch CGConv", xlabel="Batch Index", ylabel="Time (s)")
    CairoMakie.scatterlines!(ax, times, label="Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*quantile(times, 0.99))
    CairoMakie.save("cgconv_cuda_differentiation_time.png", fig)
end

This was run with the following package versions:

CUDA = "5.8.2"
CairoMakie = "0.13.10"
Flux = "0.16.4"
GraphNeuralNetworks = "1.0.0"
MLUtils = "0.4.8"
cuDNN = "1.4.3"

I attached the resulting images. It seems that, specifically for the SGConv layer, the per-batch runtime keeps increasing over time.
It almost looks as if some internal state is accumulating that Zygote then has to differentiate through. Normally I would suspect some kind of memory leak, but after staring at this for a while I am quite confused.
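
Since `Flux.withgradient` uses Zygote under the hood, one idea for narrowing this down would be to time the forward pass and the pullback separately, to see which of the two gets slower over the batches. A rough sketch of what I mean (not something I have run systematically, just the direction I would probe next):

import Zygote

fwd_times = Float64[]
bwd_times = Float64[]

thing = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops=true)

for bg in data_loader
    # time the forward pass (including construction of the pullback)
    t0 = time_ns()
    y, back = Zygote.pullback(m -> sum(m(bg, bg.ndata.x)), thing)
    push!(fwd_times, (time_ns() - t0) / 1e9)

    # time only the backward pass
    push!(bwd_times, @elapsed back(one(y)))
end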

Has anyone encountered this before, or can someone point out a mistake I made or suggest how to debug this further?
Thanks in advance!

For clarification: the images showing the linear increase in measured runtime are the ones using SGConv. Also, where the labels say GATConv they should read SGConv (I forgot to change the labels).

Output from `CUDA.pool_status()` for the CUDA test case, printed on every 50th batch:

Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 215.494 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1018.041 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 647.486 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 322.637 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1.027 GiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 693.769 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 359.210 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1.059 GiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 729.254 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 352.808 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1.065 GiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 727.764 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 396.216 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1.091 GiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 760.323 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 395.635 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.42% (3.623 GiB/23.494 GiB)
Memory pool usage: 1.093 GiB (1.125 GiB reserved)
Effective GPU memory usage: 15.39% (3.617 GiB/23.494 GiB)
Memory pool usage: 760.915 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.39% (3.617 GiB/23.494 GiB)
Memory pool usage: 394.625 MiB (1.125 GiB reserved)
Effective GPU memory usage: 15.39% (3.617 GiB/23.494 GiB)
Memory pool usage: 65.648 MiB (1.125 GiB reserved)
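
For what it’s worth, the pool usage above oscillates between roughly 65 MiB and 1.1 GiB (with a constant 1.125 GiB reserved) rather than growing monotonically, so a plain GPU memory leak does not look likely to me. Another thing that could be tried, purely as a sketch and not something I have verified changes anything, is forcing garbage collection and releasing cached device memory every few batches inside the CUDA test loop above (`thing`, `times`, and `data_loader` as defined there):

for (i, bg) in enumerate(data_loader)
    bg_gpu = bg |> Flux.gpu # move to gpu

    t = CUDA.@elapsed Flux.withgradient(thing) do model
        sum(model(bg_gpu, bg_gpu.ndata.x))
    end
    push!(times, t)

    # every 50 batches: free unreferenced buffers and release cached memory from the CUDA pool,
    # to check whether allocator pressure is responsible for the growing runtimes
    if i % 50 == 0
        GC.gc()
        CUDA.reclaim()
    end
end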