This post is hidden. You deleted this post 8 hours ago.
I have been working on the following loss function to train neural networks based on a statistical invariant. I have tried to port the function to GPU/CUDA, and I have the following code. It works correctly; however, since I am new to CUDA, I would like to ask if anyone knows or can think of any optimization that could speed up the code. Thank you very much.
The loss function is,
function sliced_invariant_statistical_loss_optimized_gpu_2(nn_model, loader, hparams)
@assert loader.batchsize == hparams.samples
@assert length(loader) == hparams.epochs
losses = Vector{Float32}()
optim = Flux.setup(Flux.Adam(hparams.η), nn_model)
@showprogress for data in loader
data = gpu(data) # Move data to GPU once per epoch
Ω = gpu([sample_random_direction(size(data)[1]) for _ in 1:(hparams.m)])
total = 0.0f0 # Use a scalar for accumulation
# Generate all random noise in one batch and move to GPU
x_batch = gpu(rand(hparams.noise_model, hparams.samples * hparams.K))
# Pre-compute column indices for slicing
start_cols = hparams.K * (1:(hparams.samples - 1))
end_cols = hparams.K * (2:(hparams.samples)) .- 1
loss, grads = Flux.withgradient(nn_model) do nn
# Process batch through nn_model
yₖ_batch = nn(x_batch)
for ω in Ω
s = transpose(ω) * yₖ_batch
aₖ = mapreduce(+, 1:length(start_cols)) do i
slice = s[:, start_cols[i]:end_cols[i]]
dot_product = dot(ω, data[:, i + 1])
generate_aₖ(slice, dot_product)
end
total += scalar_diff(aₖ ./ sum(aₖ))
end
total / hparams.m
end
Flux.update!(optim, nn_model, grads[1])
push!(losses, loss)
end
return losses
end
and I called it like this,
dim = 100
device = gpu
model = device(Generator(latent_dim)) #this what ever is neural network
# Mean vector (zero vector of length dim)
mean_vector = zeros(dims)
# Covariance matrix (identity matrix of size dim x dim)
cov_matrix = Diagonal(ones(dims))
# Create the multivariate normal distribution
noise_model = device(MvNormal(mean_vector, cov_matrix))
hparams = device(
HyperParamsSlicedISL(;
K=2, samples=1000, epochs=60, η=1e-3, noise_model=noise_model, m=2
),
)
sliced_invariant_statistical_loss_optimized_gpu_2(model, train_loader, hparams)
The rest of the functions are,
function _sigmoid(
ŷ::Transpose{T,CUDA.CuArray{T,1,CUDA.Mem.DeviceBuffer}}, y::T
) where {T<:AbstractFloat}
return sigmoid_fast.((y .- ŷ) .* 10.0f0)
end
function ϕ(
yₖ::Transpose{T,CUDA.CuArray{T,1,CUDA.Mem.DeviceBuffer}}, yₙ::T
) where {T<:AbstractFloat}
return CUDA.sum(_sigmoid(yₖ, yₙ))
end
function γ(
yₖ::Transpose{T,CUDA.CuArray{T,1,CUDA.Mem.DeviceBuffer}}, yₙ::T, m::Int64
) where {T<:AbstractFloat}
function eₘ_cuda(m, length)
return CuArray([j == m ? T(1.0) : T(0.0) for j in 0:length])
end
return eₘ_cuda(m, size(yₖ, 2)) * ψₘ(ϕ(yₖ, yₙ), m)
end
function generate_aₖ(
ŷ::Transpose{T,CUDA.CuArray{T,1,CUDA.Mem.DeviceBuffer}}, y::T
) where {T<:AbstractFloat}
# Calculate the squared differences
return CUDA.sum([γ(ŷ, y, k) for k in 0:length(ŷ)])
end
@inline scalar_diff(q::CuArray{T}) where {T<:AbstractFloat} =
sum((q .- (T(1.0f0) ./ T(length(q)))) .^ 2)