Type stability when using fields of structs

I am trying to optimize a function that mainly operates on arrays which are (nearly all) fields of a given struct, yet its performance is not great. My first idea was to check type stability with the @code_warntype macro. There are a lot of red-marked warnings, but I don't really know how to avoid them or how to annotate the types of the variables correctly.
Could anyone please help me optimize the performance of the following function, e.g. how to get rid of the type instabilities?

The original function (I already tried multithreading at some points):
conv_layer is an instance of a struct (see below)

# Computes the derivative of the kernels/weights on the given layer; the results are used to optimize the kernels/weights
function multichannel_conv_gradients(conv_layer)
    # storing all the necessary shapes
    current_batch_size::Int, in_channels::Int, input_height::Int, input_width::Int = size(conv_layer.inputs)
    current_batch_size, out_channels::Int, output_height::Int, output_width::Int = size(conv_layer.outputs)
    kernel_height::Int, kernel_width::Int = size(conv_layer.kernels)[3:4]

    # storing often used data which will be modified
    inputs = conv_layer.inputs_padded
    losses = conv_layer.losses

    # calculating derivative of the activation function
    if conv_layer.df != 1
        df = conv_layer.df(conv_layer.outputs_no_activation)
    else
        df = ones(current_batch_size, out_channels, output_height, output_width)
    end

    gradients = conv_layer.gradients

    # going through all samples in the batch
    for index_batch in 1:current_batch_size

        # going through all out_channels (because each out_channel can be viewed separately)
        # for out_channel in 1:out_channels
        Threads.@threads for out_channel in 1:out_channels

            # going through each gradient (i.e. every weight, same shape as kernels)
            for in_channel in 1:in_channels, y_w in 1:kernel_height, x_w in 1:kernel_width
                value = 0.00
                
                # going through each output (because each weight has influence on every output)
                # for y_out in 1:output_height, x_out in 1:output_width
                Threads.@threads for y_out in 1:output_height # @inbounds
                Threads.@threads for x_out in 1:output_width # @inbounds
                    m, n = get_input_position((y_out, x_out), conv_layer.stride)
                    value += inputs[index_batch, in_channel, m + y_w - 1, n + x_w - 1] * losses[index_batch, out_channel, y_out, x_out] * df[index_batch, out_channel, y_out, x_out] # .*
                end
                end
            
                gradients[out_channel, in_channel, y_w, x_w] += value
            end

        end
    
    end
    if current_batch_size != 1
        gradients /= current_batch_size
    end
    
    return gradients
end

I think the struct (conv_layer) is also necessary to see, because some of its fields can be nothing or an Array.
The (mutable) struct:

# returns the position in an input matrix given by the position in the output (e.g. useful for conv, pool and different backward passes)
# output_position and stride must be tuples
function get_input_position(output_position::Tuple, stride::Tuple)
    m = output_position[1] + (stride[1] - 1) * (output_position[1] - 1)
    n = output_position[2] + (stride[2] - 1) * (output_position[2] - 1)

    return m, n
end
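A quick usage check of the formula above: with stride (2, 2), output position (3, 4) maps to input position (5, 7).

get_input_position((3, 4), (2, 2))  # returns (5, 7)
get_input_position((1, 1), (2, 2))  # returns (1, 1) – the first output always maps to the first input row/column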

mutable struct Conv
    # characteristics of the layer
    in_channels::Int
    out_channels::Int
    kernel_size::Tuple
    stride::Tuple
    padding::Tuple
    activation_function # can be nothing
    df # derivative of activation function
    # data
    inputs # can be nothing
    inputs_padded # saved for performance optimization
    kernels::Array # weights
    outputs_no_activation # can be nothing
    outputs # can be nothing
    losses # can be nothing
    previous_losses # losses for the previous layer, can be nothing
    gradients::Array # gradients of the kernels/weights
    derivative_cache::Array
    # custom constructor
    function Conv(in_channels::Int, out_channels::Int, kernel_size::Tuple; stride::Tuple=(1, 1), padding::Tuple=(0, 0), activation_function=nothing)
        # setting up the activation function
        if isnothing(activation_function)
            new_activation_function = nothing
            df = 1
            gain = 1
            #=
        else
            new_activation_function = sigmoid
            df = d_sigmoid
            gain = 1
        =#
        end

        # initialize kernels/weights
        kernels_shape = (out_channels, in_channels, kernel_size[1], kernel_size[2])
        kernels = randn(kernels_shape)
        # initialize gradients of kernels/weights
        gradients = zeros(size(kernels))

        # placeholders
        inputs = nothing
        inputs_padded = nothing
        outputs_no_activation = nothing
        outputs = nothing
        losses = nothing
        previous_losses = nothing
        derivative_cache = Array{Any}(undef) ##

        # create new instance/object
        new(in_channels, 
            out_channels, 
            kernel_size, 
            stride, 
            padding, 
            new_activation_function, 
            df, 
            inputs,
            inputs_padded,
            kernels,
            outputs_no_activation,
            outputs,
            losses,
            previous_losses,
            gradients,
            derivative_cache ##
        )
    end
end

And finally the function for benchmarking (run with multiple threads). If you want, you can run @code_warntype multichannel_conv_gradients(layer)
to check for type-related problems.

using BenchmarkTools, InteractiveUtils
function benchmark()
    input = rand(1, 6, 14, 14)
    layer.inputs = input
    layer.inputs_padded = input
    output = rand(1, 16, 10, 10) # normally computed before but for simplicity just random initialized
    layer.outputs_no_activation = output
    layer.outputs = output
    layer.losses = rand(size(output)...) # normally computed before but for simplicity just random initialized
    # println(calculate_output_shape(14, 14, 5, 5, stride=(1, 1), padding=(0, 0)))
    # exit()

    # interesting part
    # @code_warntype multichannel_conv_gradients(layer)
    gradients = @btime multichannel_conv_gradients(layer)
end

layer = Conv(6, 16, (5, 5))
benchmark()
>>> 6.292 ms (1960939 allocations: 39.80 MiB)

I am very sorry that it is so much code in total; please let me know if you have any questions about the code.

The struct fields are all Any, which is the source of the type instability. Btw, why reinvent Conv?

https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.Conv
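For reference, the roughly equivalent Flux layer could be constructed like this (a sketch, assuming Flux is installed and run in a fresh session, since this thread defines its own Conv; note that Flux expects WHCN layout, i.e. width × height × channels × batch, rather than the batch-first layout used above):

using Flux

flux_layer = Conv((5, 5), 6 => 16, relu)   # 5×5 kernel, 6 input channels, 16 output channels
x = rand(Float32, 14, 14, 6, 1)            # WHCN layout
size(flux_layer(x))                        # (10, 10, 16, 1)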

3 Likes

It’s a lot of code, so this is just my brief first impression:

Your struct definition is full of abstract types, such as Array and Tuple, and even worse, many untyped fields, which become Any. Very bad for type stability. All your struct fields should have a fully concrete or parametric type (or perhaps a small Union with Nothing).
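For illustration, a minimal sketch of what that looks like (the field names here are just examples, not a drop-in replacement for Conv): every field is either fully concrete, parametric (so it becomes concrete per instance), or a small Union with Nothing.

mutable struct Layer{T<:AbstractFloat}
    kernels::Array{T, 4}                   # concrete once T is fixed
    stride::Tuple{Int, Int}                # concrete tuple type
    inputs::Union{Nothing, Array{T, 4}}    # small Union, still handled efficiently
end

l = Layer{Float64}(randn(16, 6, 5, 5), (1, 1), nothing)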

Remove the @threads for now, they make it difficult to identify performance issues. Add back one thread call (not multiple nested ones) at the very end. Threads should be added when everything else is fast, not as an effort to save inefficient code.
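To illustrate with a self-contained toy example (hypothetical arrays, not the original layer): thread only the outermost loop whose iterations are independent, once the serial version is already fast. Each iteration writes to its own slot of acc, so there is no data race and no nested @threads is needed.

function threaded_accumulate(data::Array{Float64, 3})
    channels, h, w = size(data)
    acc = zeros(channels)
    Threads.@threads for c in 1:channels   # a single level of threading
        s = 0.0                            # thread-local accumulator
        for j in 1:w, i in 1:h
            s += data[c, i, j]
        end
        acc[c] = s                         # disjoint writes, one slot per channel
    end
    return acc
end

threaded_accumulate(rand(16, 10, 10))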

2 Likes

Just for fun and to get a better understanding of CNNs.

Thank you (again). I have now removed the threading and annotated a lot of the types in the struct. Most of the red-marked type warnings (@code_warntype) are gone. However, df and gradients are still marked red:

...
gradients::Array{Float64} <- red when displayed with @code_warntype
stride::Tuple{Int64, Int64}
df::Array{Float64}
...
Body::Array{Float64} <- red when displayed with @code_warntype

If you want to take a look at the whole code again:

mutable struct Conv
    # characteristics of the layer
    in_channels::Int
    out_channels::Int
    kernel_size::Tuple{Int, Int}
    stride::Tuple{Int, Int}
    padding::Tuple{Int, Int}
    activation_function::Union{Nothing, Function} # can be nothing
    df::Union{Function, Int} # derivative of activation function
    # data
    inputs::Union{Nothing, Array{Float64}} # can be nothing
    inputs_padded::Union{Nothing, Array{Float64}} # saved for performance optimization
    kernels::Array{Float64} # weights
    outputs_no_activation::Union{Nothing, Array{Float64}} # can be nothing
    outputs::Union{Nothing, Array{Float64}} # can be nothing
    losses::Union{Nothing, Array{Float64}} # can be nothing
    previous_losses::Union{Nothing, Array{Float64}} # losses for the previous layer, can be nothing
    gradients::Array{Float64} # gradients of the kernels/weights
    derivative_cache::Array{Float64}
    # custom constructor
    function Conv(in_channels::Int, out_channels::Int, kernel_size::Tuple; stride::Tuple=(1, 1), padding::Tuple=(0, 0), activation_function=nothing)
        # setting up the activation function
        if isnothing(activation_function)
            new_activation_function = nothing
            df = 1
            gain = 1
            #=
        else
            new_activation_function = sigmoid
            df = d_sigmoid
            gain = 1
        =#
        end

        # initialize kernels/weights
        kernels_shape = (out_channels, in_channels, kernel_size[1], kernel_size[2])
        kernels = randn(kernels_shape)
        # initialize gradients of kernels/weights
        gradients = zeros(size(kernels))

        # placeholders
        inputs = nothing
        inputs_padded = nothing
        outputs_no_activation = nothing
        outputs = nothing
        losses = nothing
        previous_losses = nothing
        derivative_cache = Array{Float64}(undef) ##

        # create new instance/object
        new(in_channels, 
            out_channels, 
            kernel_size, 
            stride, 
            padding, 
            new_activation_function, 
            df, 
            inputs,
            inputs_padded,
            kernels,
            outputs_no_activation,
            outputs,
            losses,
            previous_losses,
            gradients,
            derivative_cache ##
        )
    end
end

# Computes the derivative of the kernels/weights on the given layer; the results are used to optimize the kernels/weights
function multichannel_conv_gradients(conv_layer)
    # storing all the necessary shapes
    current_batch_size::Int, in_channels::Int, input_height::Int, input_width::Int = size(conv_layer.inputs)
    current_batch_size, out_channels::Int, output_height::Int, output_width::Int = size(conv_layer.outputs)
    kernel_height::Int, kernel_width::Int = size(conv_layer.kernels)[3:4]

    # storing often used data which will be modified
    inputs = conv_layer.inputs_padded
    losses = conv_layer.losses

    # calculating derivative of the activation function
    if conv_layer.df != 1
        df = conv_layer.df(conv_layer.outputs_no_activation)
    else
        df::Array{Float64} = ones(current_batch_size, out_channels, output_height, output_width)
    end

    gradients::Array{Float64} = conv_layer.gradients
    stride::Tuple{Int, Int} = conv_layer.stride

    # going through all samples in the batch
    for index_batch in 1:current_batch_size

        # going through all out_channels (because each out_channel can be viewed separately)
        for out_channel in 1:out_channels
        # Threads.@threads for out_channel in 1:out_channels

            # going through each gradient (i.e. every weight, same shape as kernels)
            for in_channel in 1:in_channels, y_w in 1:kernel_height, x_w in 1:kernel_width
            # Threads.@threads for in_channel in 1:in_channels
            # for y_w in 1:kernel_height
            # for x_w in 1:kernel_width
                value = 0.00
                
                # going through each output (because each weight has influence on every output)
                for y_out in 1:output_height, x_out in 1:output_width
                # Threads.@threads for y_out in 1:output_height # @inbounds
                # Threads.@threads for x_out in 1:output_width # @inbounds
                    # m, n = get_input_position((y_out, x_out), conv_layer.stride)
                    m = y_out + (stride[1] - 1) * (y_out - 1)
                    n = x_out + (stride[2] - 1) * (x_out - 1)
                    value += inputs[index_batch, in_channel, m + y_w - 1, n + x_w - 1] * losses[index_batch, out_channel, y_out, x_out] * df[index_batch, out_channel, y_out, x_out] # .*
                # end
                end
            
                gradients[out_channel, in_channel, y_w, x_w] += value
            end
            # end
            # end

        end
    
    end
    if current_batch_size != 1
        gradients /= current_batch_size
    end
    
    return gradients
end

using BenchmarkTools, InteractiveUtils
function benchmark()
    input = rand(1, 6, 14, 14)
    layer.inputs = input
    layer.inputs_padded = input
    output = rand(1, 16, 10, 10) # normally computed before but for simplicity just random initialized
    layer.outputs_no_activation = output
    layer.outputs = output
    layer.losses = rand(size(output)...) # normally computed before but for simplicity just random initialized
    # println(calculate_output_shape(14, 14, 5, 5, stride=(1, 1), padding=(0, 0)))
    # exit()

    # interesting part
    @code_warntype multichannel_conv_gradients(layer)
    gradients = @btime multichannel_conv_gradients(layer)
    # println(typeof(benchmark))
end

layer = Conv(6, 16, (5, 5))
benchmark()

Benchmark results:

  • original with nearly no type-annotations: >>> 6.292 ms (1960939 allocations: 39.80 MiB)
  • with original code but threading removed: >>> ~40ms
  • with no threads but new code: >>> 17.417 ms (724807 allocations: 11.07 MiB)
Array{Float64}

is not a concrete type. Try Array{Float64, N}, where N is 1 for a vector, 2 for a matrix, and larger for a higher-dimensional array. That will probably make the last two reds in @code_warntype disappear and might help quite a lot with performance.
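You can check this directly in the REPL:

isconcretetype(Array{Float64})      # false – the dimension N is still free
isconcretetype(Array{Float64, 4})   # true  – fully concrete, fast as a field type
isconcretetype(Matrix{Float64})     # true  – Matrix{Float64} is an alias for Array{Float64, 2}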

1 Like

Also, Function is not a concrete type. You should try:

mutable struct Conv{F<: Function}
    # characteristics of the layer
    in_channels::Int
    out_channels::Int
    kernel_size::Tuple{Int, Int}
    stride::Tuple{Int, Int}
    padding::Tuple{Int, Int}
    activation_function::Union{Nothing, Function} # can be nothing
    df::Union{F, Int}
    …

Note that every function is its own type, so you might need F1, F2, … at the top of the struct…
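A minimal sketch of that idea (illustrative names, not the code from this thread): the concrete function types become type parameters of the struct, so field access stays type-stable without a Union{Function, Int}.

struct Activation{F<:Function, DF<:Function}
    f::F      # activation function
    df::DF    # its derivative
end

my_relu(x) = max(x, zero(x))
d_my_relu(x) = x > zero(x) ? one(x) : zero(x)

act = Activation(my_relu, d_my_relu)   # Activation{typeof(my_relu), typeof(d_my_relu)}
act.df(0.5)                            # 1.0, dispatched statically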

1 Like

In which situations could this be an Int? And does it have to be stored inside the struct?

1 Like

Thank you VERY much! The runtime now: >>> 868.800 μs (2 allocations: 12.67 KiB)
I didn't expect such a great performance boost. When using no threads, the last two red warnings are also gone. (When using threads, a ::Core.Box is shown in red, but I don't think that is a problem.)

df is the derivative function of activation_function; if activation_function == nothing, df is 1. The 1 just signals that it is not necessary to take the derivative of activation_function.
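(As a hedged side note, not the code above: one way to avoid the Union{Function, Int} entirely would be to store an actual derivative for the "no activation" case, so df always has a concrete function type and the if conv_layer.df != 1 branch disappears.)

# Hypothetical helper: the derivative of the identity activation is just ones.
d_identity(x) = ones(eltype(x), size(x))

# With a parametric field df::F (F <: Function), the constructor could then set
# df = isnothing(activation_function) ? d_identity : d_sigmoid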

Thank you also for this information. I am not sure if that is really necessary for now but maybe I will try it in the future.

@DNF and @vettert, thank you both. In case you want to see the current version of the code:

mutable struct Conv
    # characteristics of the layer
    in_channels::Int
    out_channels::Int
    kernel_size::Tuple{Int, Int}
    stride::Tuple{Int, Int}
    padding::Tuple{Int, Int}
    activation_function::Union{Nothing, Function} # can be nothing
    df::Union{Function, Int} # derivative of activation function
    # data
    inputs::Union{Nothing, Array{Float64, 4}} # can be nothing
    inputs_padded::Union{Nothing, Array{Float64, 4}} # saved for performance optimization
    kernels::Array{Float64, 4} # weights
    outputs_no_activation::Union{Nothing, Array{Float64, 4}} # can be nothing
    outputs::Union{Nothing, Array{Float64, 4}} # can be nothing
    losses::Union{Nothing, Array{Float64, 4}} # can be nothing
    previous_losses::Union{Nothing, Array{Float64, 4}} # losses for the previous layer, can be nothing
    gradients::Array{Float64, 4} # gradients of the kernels/weights
    derivative_cache::Array{Float64}
    # custom constructor
    function Conv(in_channels::Int, out_channels::Int, kernel_size::Tuple; stride::Tuple=(1, 1), padding::Tuple=(0, 0), activation_function=nothing)
        # setting up the activation function
        if isnothing(activation_function)
            new_activation_function = nothing
            df = 1
            gain = 1
            #=
        else
            new_activation_function = sigmoid
            df = d_sigmoid
            gain = 1
        =#
        end

        # initialize kernels/weights
        kernels_shape = (out_channels, in_channels, kernel_size[1], kernel_size[2])
        kernels = randn(kernels_shape)
        # initialize gradients of kernels/weights
        gradients = zeros(size(kernels))

        # placeholders
        inputs = nothing
        inputs_padded = nothing
        outputs_no_activation = nothing
        outputs = nothing
        losses = nothing
        previous_losses = nothing
        derivative_cache = Array{Float64}(undef) ##

        # create new instance/object
        new(in_channels, 
            out_channels, 
            kernel_size, 
            stride, 
            padding, 
            new_activation_function, 
            df, 
            inputs,
            inputs_padded,
            kernels,
            outputs_no_activation,
            outputs,
            losses,
            previous_losses,
            gradients,
            derivative_cache ##
        )
    end
end

# Computes the derivative of the kernels/weights on the given layer; the results are used to optimize the kernels/weights
function multichannel_conv_gradients(conv_layer)
    # storing all the necessary shapes
    current_batch_size::Int, in_channels::Int, input_height::Int, input_width::Int = size(conv_layer.inputs)
    current_batch_size, out_channels::Int, output_height::Int, output_width::Int = size(conv_layer.outputs)
    kernel_height::Int, kernel_width::Int = size(conv_layer.kernels)[3:4]

    # storing often used data which will be modified
    inputs = conv_layer.inputs_padded
    losses = conv_layer.losses

    # calculating derivative of the activation function
    if conv_layer.df != 1
        df = conv_layer.df(conv_layer.outputs_no_activation)
    else
        df::Array{Float64, 4} = ones(current_batch_size, out_channels, output_height, output_width)
    end

    gradients::Array{Float64, 4} = conv_layer.gradients
    stride::Tuple{Int, Int} = conv_layer.stride

    # going through all samples in the batch
    for index_batch in 1:current_batch_size

        # going through all out_channels (because each out_channel can be viewed separately)
        for out_channel in 1:out_channels
        # Threads.@threads for out_channel in 1:out_channels

            # going through each gradient (i.e. every weight, same shape as kernels)
            for in_channel in 1:in_channels, y_w in 1:kernel_height, x_w in 1:kernel_width
            # Threads.@threads for in_channel in 1:in_channels
            # for y_w in 1:kernel_height
            # for x_w in 1:kernel_width
                value = 0.00
                
                # going through each output (because each weight has influence on every output)
                for y_out in 1:output_height, x_out in 1:output_width
                # Threads.@threads for y_out in 1:output_height # @inbounds
                # Threads.@threads for x_out in 1:output_width # @inbounds
                    # m, n = get_input_position((y_out, x_out), conv_layer.stride)
                    m = y_out + (stride[1] - 1) * (y_out - 1)
                    n = x_out + (stride[2] - 1) * (x_out - 1)
                    value += inputs[index_batch, in_channel, m + y_w - 1, n + x_w - 1] * losses[index_batch, out_channel, y_out, x_out] * df[index_batch, out_channel, y_out, x_out] # .*
                # end
                end
            
                gradients[out_channel, in_channel, y_w, x_w] += value
            end
            # end
            # end

        end
    
    end
    if current_batch_size != 1
        gradients /= current_batch_size
    end
    
    return gradients
end

using BenchmarkTools, InteractiveUtils
function benchmark()
    input = rand(1, 6, 14, 14)
    layer.inputs = input
    layer.inputs_padded = input
    output = rand(1, 16, 10, 10) # normally computed before but for simplicity just random initialized
    layer.outputs_no_activation = output
    layer.outputs = output
    layer.losses = rand(size(output)...) # normally computed before but for simplicity just random initialized
    # println(calculate_output_shape(14, 14, 5, 5, stride=(1, 1), padding=(0, 0)))
    # exit()

    # interesting part
    @code_warntype multichannel_conv_gradients(layer)
    gradients = @btime multichannel_conv_gradients(layer)
    # println(typeof(benchmark))
end

layer = Conv(6, 16, (5, 5))
benchmark()
>>> 869.400 μs (2 allocations: 12.67 KiB)


1 Like