Hello everyone,
I’m currently training a model with the Flux framework, but I’ve hit a significant performance problem: each epoch takes around 10 minutes, which seems unusually slow. I’ve experimented with different loss functions, but training speed remains the bottleneck.
using Flux, Statistics # imports used throughout (Flux layers and `mean`)

# Define a multi-layer perceptron (MLP) structure
struct DiscreetMLP
    dropout::Dropout # Dropout layer to prevent overfitting
    layer1::Dense    # First dense layer
    layer2::Dense    # Second dense layer
    layer3::Dense    # Third dense layer
end
# Enable Flux to work with DiscreetMLP
Flux.@functor DiscreetMLP
# Constructor for DiscreetMLP, taking hidden layer size and dropout rate as inputs
function DiscreetMLP(hidden_size, dropout_rate)
    layer1 = Dense(hidden_size, 2 * hidden_size, tanh)     # First layer with activation
    layer2 = Dense(2 * hidden_size, 2 * hidden_size, tanh) # Second layer with activation
    layer3 = Dense(2 * hidden_size, 1)                     # Output layer
    dropout = Dropout(dropout_rate)                        # Dropout layer
    return DiscreetMLP(dropout, layer1, layer2, layer3)    # Return the constructed model
end
# Define the forward pass for the MLP
function (mlp::DiscreetMLP)(input)
    input = mlp.layer1(input)  # Pass input through first layer
    input = mlp.dropout(input) # Apply dropout
    input = mlp.layer2(input)  # Pass input through second layer
    input = mlp.dropout(input) # Apply dropout
    input = mlp.layer3(input)  # Pass input through output layer
    return input               # Return final output
end
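For reference, this is how I sanity-check the MLP on its own (the sizes here are made up; hidden size 20 matches what I use below):
# Quick standalone check of the MLP (hypothetical sizes)
mlp = DiscreetMLP(20, 0.6)     # hidden size 20, dropout rate 0.6
x = rand(Float32, 20, 8)       # 20 features × 8 samples
@assert size(mlp(x)) == (1, 8) # one score per sample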
# Define a graph neural network structure (LayerInit and Layer are my own graph layer types, defined elsewhere and omitted here)
struct DiscreetGNN
    init_layer::LayerInit
    layer1::Layer
    layer2::Layer
    layer3::Layer
    layer4::Layer
    layer5::Layer
    mlp::DiscreetMLP
end
Flux.@functor DiscreetGNN
# Constructor for DiscreetGNN, taking input size, hidden size, and dropout rate
function DiscreetGNN(input_size, hidden_size, dropout_rate)
    init_layer = LayerInit(input_size, hidden_size) # Input embedding layer
    layer1 = Layer(hidden_size, hidden_size) # First GNN layer
    layer2 = Layer(hidden_size, hidden_size) # Second GNN layer
    layer3 = Layer(hidden_size, hidden_size) # Third GNN layer
    layer4 = Layer(hidden_size, hidden_size) # Fourth GNN layer
    layer5 = Layer(hidden_size, hidden_size) # Fifth GNN layer
    mlp = DiscreetMLP(hidden_size, dropout_rate) # Create MLP instance
    return DiscreetGNN(init_layer, layer1, layer2, layer3, layer4, layer5, mlp) # Return the constructed GNN
end
# Define the forward pass for the GNN
function (gnn::DiscreetGNN)(adj1, adj2)
    # I have ensured that inputs and outputs are of type Float32
    # Normalize and propagate through layers
    x_1 = mynormalize(gnn.init_layer(adj2))  # Initial embedding from the second adjacency input
    x_2 = mynormalize(gnn.layer1(x_1, adj1)) # First GNN layer propagation
    x_3 = mynormalize(gnn.layer2(x_2, adj1)) # Second GNN layer propagation
    x_4 = mynormalize(gnn.layer3(x_3, adj1)) # Third GNN layer propagation
    x_5 = mynormalize(gnn.layer4(x_4, adj1)) # Fourth GNN layer propagation
    x_6 = gnn.layer5(x_5, adj1)              # Fifth GNN layer propagation
    # Calculate scores from the intermediate outputs
    s1 = gnn.mlp(x_1') # Score from the initial embedding
    s2 = gnn.mlp(x_2') # Score from the first GNN layer
    s3 = gnn.mlp(x_3') # Score from the second GNN layer
    s4 = gnn.mlp(x_4') # Score from the third GNN layer
    s5 = gnn.mlp(x_6') # Score from the fifth GNN layer (x_5 itself is not scored)
    # Combine the scores from all layers
    score_total = s1 + s2 + s3 + s4 + s5
    return sigmoid.(score_total) # Apply sigmoid elementwise to return probabilities
end
# Function to normalize the input
function mynormalize(x)
    norms = sqrt.(sum(abs2, x, dims=2)) .+ 0.01f0 # Row norms with a small Float32 constant to avoid division by zero (0.01f0 avoids promoting to Float64)
    return Float32.(x ./ norms) # Normalize the input
end
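To make the intent concrete, a tiny example of what mynormalize does (row-wise L2 normalization; the numbers are made up):
# Each row is scaled to roughly unit L2 norm; the 0.01f0 offset keeps zero rows finite
x = Float32[3 4; 6 8; 0 0]
mynormalize(x) # rows ≈ [0.6 0.8; 0.6 0.8; 0.0 0.0]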
# Helper to create the model
function create_discreet_model(model_size)
    model = DiscreetGNN(model_size, 20, 0.6) # input size, hidden size 20, dropout rate 0.6
    return model
end
# MODEL TRAINING
# Custom cost-sensitive binary cross-entropy loss function
function cost_sensitive_loss(predictions, targets; pos_weight=2, neg_weight=1, aggregation=mean, epsilon=1f-10)
    pos_weighted = targets .* pos_weight        # Weight for positive class
    neg_weighted = (1 .- targets) .* neg_weight # Weight for negative class
    losses = @.(-pos_weighted * log(predictions + epsilon) - neg_weighted * log(1 - predictions + epsilon)) # Weighted cross-entropy terms (1f-10 keeps everything in Float32)
    return aggregation(losses) # Return aggregated loss
end
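A quick check on dummy values (made-up numbers) shows how the default weights penalize missed positives twice as hard as missed negatives:
# Example usage of the weighted loss
preds   = Float32[0.9, 0.1, 0.8]
targets = Float32[1, 0, 0]
cost_sensitive_loss(preds, targets)               # mean weighted cross-entropy
cost_sensitive_loss(preds, targets; pos_weight=5) # penalize missed positives harder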
# Function to round a single prediction based on a threshold (broadcast with a dot for arrays)
function custom_round(prediction, threshold=0.9)
    return Int(prediction >= threshold) # 1 if at or above the threshold, else 0
end
# Function to calculate total accuracy
function overall_accuracy(predictions, targets)
    return sum(custom_round.(predictions) .== targets) / length(targets) # Calculate accuracy
end
# Function to calculate class-specific accuracy
function class_specific_accuracy(predictions, targets, class)
    pred = custom_round.(predictions) .== class # Predicted labels for the specific class
    true_vals = targets .== class               # True labels for the specific class
    correct_predictions = sum(pred .& true_vals) # Count correct predictions
    total_count = sum(true_vals)                 # Total instances for the class
    return correct_predictions / total_count     # Return class-specific accuracy
end
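For example (made-up predictions and targets):
# Rounding at the default 0.9 threshold, then computing both accuracies
preds   = Float32[0.95, 0.2, 0.91, 0.5]
targets = [1, 0, 0, 1]
custom_round.(preds)                       # [1, 0, 1, 0]
overall_accuracy(preds, targets)           # 0.5
class_specific_accuracy(preds, targets, 1) # accuracy on the positive class: 0.5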
loss(ŷ, y) = cost_sensitive_loss(ŷ, y)
# Function to train the model and track the metrics
function train_discreet_model(model, train_dataset, test_dataset)
    learning_rate = 0.001 # Set learning rate
    opt = Flux.setup(Adam(learning_rate), model) # Set up optimizer state
    # Initialize arrays to store training history
    loss_train_history = []
    # ...initialize arrays for the other metrics too
    # Calculate initial losses and accuracies
    loss_train = mean([loss(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in train_dataset])
    loss_test = mean([loss(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in test_dataset])
    acc_test_binary_arr = mean([class_specific_accuracy(model(ad, adm)[1:n], binary_arr[1:n], 1) for (ad, adm, binary_arr, n) in test_dataset])
    acc_test_other = mean([class_specific_accuracy(model(ad, adm)[1:n], binary_arr[1:n], 0) for (ad, adm, binary_arr, n) in test_dataset])
    acc_total = mean([overall_accuracy(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in test_dataset])
    est = mean([sum(custom_round.(vec(model(ad, adm)))) for (ad, adm, binary_arr, _) in test_dataset])
    # Store initial metrics
    push!(loss_train_history, loss_train)
    # ...push also for the other metrics
    @show 0, loss_train, loss_test, acc_test_binary_arr, acc_test_other, acc_total, est
    # Training loop for a specified number of epochs
    for epoch in 1:5
        for (ad, adm, binary_arr, n) in train_dataset
            # Explicit-gradient update matching the Flux.setup optimizer state above
            grads = Flux.gradient(m -> loss(m(ad, adm)[1:n], binary_arr[1:n]), model)
            Flux.update!(opt, model, grads[1]) # Update model parameters
        end
        # Compute metrics after each epoch
        loss_train = mean([loss(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in train_dataset])
        loss_test = mean([loss(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in test_dataset])
        acc_test_binary_arr = mean([class_specific_accuracy(model(ad, adm)[1:n], binary_arr[1:n], 1) for (ad, adm, binary_arr, n) in test_dataset])
        acc_test_other = mean([class_specific_accuracy(model(ad, adm)[1:n], binary_arr[1:n], 0) for (ad, adm, binary_arr, n) in test_dataset])
        acc_total = mean([overall_accuracy(model(ad, adm)[1:n], binary_arr[1:n]) for (ad, adm, binary_arr, n) in test_dataset])
        est = mean([sum(custom_round.(vec(model(ad, adm)))) for (ad, adm, binary_arr, _) in test_dataset])
        # Append metrics to histories
        push!(loss_train_history, loss_train)
        # ...push for the other metrics
        # Log metrics
        @show epoch, loss_train, loss_test, acc_test_binary_arr, acc_test_other, acc_total, est
    end
    # Return the final metrics and histories
    return loss_train_history # ...and the rest of the metric histories
end
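In case it helps, each dataset entry is a tuple (adjacency, second adjacency, binary targets, count). A synthetic stand-in for one entry looks like this (my real data loading is omitted):
# Hypothetical dataset entry with random data, illustrative only
n   = 50
ad  = Float32.(rand(n, n) .< 0.1)   # adjacency matrix
adm = Float32.(rand(n, n) .< 0.1)   # second adjacency-like input
binary_arr = Float32.(rand(0:1, n)) # binary targets for the first n outputs
train_dataset = [(ad, adm, binary_arr, n)]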
I would greatly appreciate any suggestions or insights into what might be causing the slow performance and how to improve it.
Thank you for your help!