Hi!
I created a structure for ML (for a generalized linear modell) and i want to store the linking function by initializing the structure using the name of the distribution.
The code of the structure:
mutable struct GLM
learning_input::Matrix{Float64}
learning_output::Vector{Float64}
weights::Vector{Float64}
distribution::String # Distribution (normal, exponential, etc.)
link_function::Function # Link function
cost_convergence_error::Float64
cost_history::Vector{Float64}
name::String
description::String
function GLM(learning_input::Matrix{Float64}, learning_output::Vector{Float64}, distribution::String="normal",name::String="Generalized Linear Modell", description::String="This modell is for a generalized linear modell!")
@assert size(learning_input, 1) == length(learning_output) "Inputs and outputs sizes do not match"
n_features = size(learning_input, 2)
weights = zeros(n_features) # Generate initial weights as zeros
cost_convergence_error = 0.0 # Default cost convergence error
cost_history = Float64[] # Empty cost history vector
link_fn=get_link_function(distribution, weights)
new(learning_input, learning_output, weights, distribution, link_fn, cost_convergence_error, cost_history, name, description)
end
end
The code for the get_link_function:
function get_link_function(distribution::String, weights)
if distribution == "normal"
return (
x -> dot(x, weights) # Identity with weights
)
elseif distribution in ["exponential", "poisson"]
return (
x -> log(dot(x, weights)) # Log with weighted input
)
elseif distribution == "gamma"
return (
x -> 1 / dot(x, weights) # Inverse with weights
)
elseif distribution == "inverse_gaussian"
return (
x -> 1 / (dot(x, weights)^2) # Inverse square with weights
)
elseif distribution in ["bernoulli", "binomial"]
return (
x -> log(dot(x, weights) / (1 - dot(x, weights))) # Logit with weights
)
elseif distribution in ["categorical", "multinomial"]
return (
x -> log.(dot(x, weights)) # Log-softmax
)
else
error("Unsupported distribution: ($distribution). Supported options: normal, exponential, poisson, gamma, inverse_gaussian, bernoulli, binomial, categorical, multinomial.")
end
end
Additional function for generating data:
function generate_data(n_samples, n_features, distribution, noise=0.1)
Random.seed!(42) # For reproducibility
X = rand(n_samples, n_features) # Random features
# Add bias term consistently (if needed)
#X = hcat(ones(n_samples), X) # Add bias term
true_weights = randn(n_features) # This should create a vector with size n_features
link_fn, link_inv_fn = get_link_fn(distribution)
# Generate dependent variable
linear_predictor = X * true_weights
if distribution == "normal"
y = linear_predictor .+ noise * randn(n_samples) # Add Gaussian noise
elseif distribution in ["exponential", "poisson"] || distribution == "bernoulli" || distribution == "binomial"
y = exp.(linear_predictor) .+ noise # Non-negative
elseif distribution == "gamma"
y = 1 ./ (linear_predictor .+ 1e-6)
else
error("Unsupported distribution: $distribution")
end
return X, y, true_weights
end
And the main() looks like:
function main()
# Parameters
n_samples = 100
n_features = 20
noise = 0.2
width = 72 # For printing
decimals = 20
distribution = "normal"
# Generate synthetic data
X, y, true_weights = generate_data(n_samples, n_features, distribution, noise)
# Initialize GLM with generated data
glm = GLM(X, y, distribution)
end
I always receive the following error for the returning the function of normal distribution in get_link_function():
ERROR: DimensionMismatch: dot product arguments have lengths 2000 and 20
I would like to have the function inside the structure so I do not have to call it in the main() to give value to link_fn of GLM. Could you tell what is the problem with my code?
Thank you for your help