First, I am quite new to Julia's ML ecosystem, so I may well have made some silly mistakes.
Recently, I tried to debug a very simple training loop involving GraphNeuralNetworks.jl's SGConv layer, and found that each epoch took successively longer than the previous one.
I tried to condense this into a minimal example that tries out various things ("minimal" might be a bit of a stretch, though):
import Flux
import GraphNeuralNetworks as GNN
import CUDA
import MLUtils
import CairoMakie
using Statistics
CUDA.allowscalar(false)  # disallow scalar indexing on GPU arrays
"""
Test script for the SGConv layer in the GraphNeuralNetworks.jl package.
"""
"""
Generate a set of community graphs for testing purposes.
"""
function generate_dummy_graphs(n_samples = 100)
    all_graphs = GNN.GNNGraph[]
    for i in 1:n_samples
        n_nodes = rand(75:125)
        # random adjacency matrix
        B = rand(Float32, n_nodes, n_nodes) .> 0.5
        A = Float32.(B .| B')  # make it symmetric
        # create simple dummy node features
        features = ones(Float32, 1, n_nodes)
        # create the GNNGraph
        g = GNN.GNNGraph(A, ndata = (x = features,))
        push!(all_graphs, g)
    end
    return all_graphs
end
################################################################################################################
# build data loader
data_loader = let
    data = generate_dummy_graphs(100_000)
    Flux.DataLoader(data; batchsize = 100, shuffle = true, collate = true)
end
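For context, my understanding is that with collate = true the loader hands each batch of 100 graphs to MLUtils.batch, which merges them into a single GNNGraph with disjoint components. A minimal sketch of that assumption (not needed by the script itself):

# Sketch of my understanding of what one batch from the loader corresponds to.
graphs = generate_dummy_graphs(100)
bg = MLUtils.batch(graphs)    # one GNNGraph containing 100 disjoint components
@assert bg.num_graphs == 100  # batched graphs keep track of their components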
################################################################################################################
# actual tests
# all cpu here, no cuda
# test sgconv with dummy data.
let
    times = Float64[]
    layer = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops = true)
    for bg in data_loader
        t = @elapsed Flux.withgradient(layer) do model
            A = model(bg, bg.ndata.x)
            return sum(A)
        end
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Differentiation Time per Batch, default CPU",
                         xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, 0.9 * minimum(times), 1.1 * quantile(times, 0.99))
    CairoMakie.save("sgconv_cpu_differentiation_time.png", fig)
end
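One variation that might help rule out allocation pressure on the CPU (a sketch, not part of the measurements above; it assumes the same data_loader):

# Sketch: same CPU loop, but with a forced GC every 100 batches, to check
# whether garbage-collection pauses rather than Zygote cause the slowdown.
let
    layer = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops = true)
    times = Float64[]
    for (i, bg) in enumerate(data_loader)
        t = @elapsed Flux.withgradient(m -> sum(m(bg, bg.ndata.x)), layer)
        i % 100 == 0 && GC.gc()
        push!(times, t)
    end
end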
####################################################################################################
# test: behavior on CUDA
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end
    CUDA.device_reset!()
    times = Float64[]
    # again a single instance, created outside the loop
    layer = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops = true) |> Flux.gpu
    for (i, bg) in enumerate(data_loader)  # no CuIterator here because the batches are not Array-like
        bg_gpu = bg |> Flux.gpu  # move the batch to the GPU
        t = CUDA.@elapsed Flux.withgradient(layer) do model
            A = model(bg_gpu, bg_gpu.ndata.x)
            return sum(A)
        end
        if i % 50 == 0
            CUDA.pool_status()  # print memory-pool statistics every 50 batches
        end
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Differentiation Time per Batch, CUDA",
                         xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, 0.9 * minimum(times), 1.1 * quantile(times, 0.99))
    CairoMakie.save("sgconv_cuda_differentiation_time.png", fig)
end
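Since CUDA.@elapsed times GPU events, a cross-check I considered is host-side timing with an explicit synchronization, so that CPU-side overhead between kernel launches is captured as well (a sketch, using the same layer and bg_gpu as in the loop above):

# Sketch: wall-clock timing with explicit synchronization, so CPU-side
# overhead (e.g. Zygote work between kernel launches) is included too.
t = @elapsed CUDA.@sync Flux.withgradient(layer) do model
    sum(model(bg_gpu, bg_gpu.ndata.x))
end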
####################################################################################################
# with inference mode only on GPU
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end
    CUDA.device_reset!()
    times = Float64[]
    layer = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops = true) |> Flux.gpu
    for bg in data_loader
        bg_gpu = bg |> Flux.gpu  # move the batch to the GPU
        t = CUDA.@elapsed sum(layer(bg_gpu, bg_gpu.ndata.x))  # forward pass only, no gradient
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Inference Time Only",
                         xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Inference Time", markersize = 6)
    CairoMakie.ylims!(ax, 0.9 * minimum(times), 1.1 * quantile(times, 0.99))
    CairoMakie.save("sgconv_cuda_inference_time.png", fig)
end
####################################################################################################
# try with another GNN layer
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end
    CUDA.device_reset!()
    times = Float64[]
    layer = GNN.CGConv(1 => 128) |> Flux.gpu
    for bg in data_loader
        bg_gpu = bg |> Flux.gpu  # move the batch to the GPU
        # differentiate w.r.t. the model, as in the SGConv tests above
        t = CUDA.@elapsed Flux.withgradient(model -> sum(model(bg_gpu, bg_gpu.ndata.x)), layer)
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Differentiation Time per Batch, CGConv",
                         xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, 0.9 * minimum(times), 1.1 * quantile(times, 0.99))
    CairoMakie.save("cgconv_cuda_differentiation_time.png", fig)
end
with the following package versions:
CUDA = "5.8.2"
CairoMakie = "0.13.10"
Flux = "0.16.4"
GraphNeuralNetworks = "1.0.0"
MLUtils = "0.4.8"
cuDNN = "1.4.3"
I attached the resulting images. It seems that, specifically for the SGConv layer, runtimes keep increasing batch after batch. It almost looks as if some internal state is accumulating that Zygote has to differentiate through. Normally I would suspect some kind of memory leak, but after staring at this for a while I am quite confused.
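The only further debugging idea I had is to check whether the layer object itself grows across batches, along these lines (a sketch; if nothing accumulates, the reported size should stay constant):

# Sketch: if state were accumulating inside the layer, its in-memory size
# should grow over the batches.
layer = GNN.SGConv(1 => 128, 1; bias = false, add_self_loops = true)
for (i, bg) in enumerate(data_loader)
    Flux.withgradient(m -> sum(m(bg, bg.ndata.x)), layer)
    i % 100 == 0 && println(i, ": ", Base.summarysize(layer))
end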
Has anyone encountered this before, or can anyone point out a mistake I made or suggest how to debug this further?
Thanks in advance!