N-dimensional ConvLayer from scratch with all the bells and whistles but... super super slow :-/

I just implemented a fully-from-scratch convolutional layer in my BetaML ML lib.

It works with or without autodiff, it accepts inputs of any number of dimensions, multiple input/output channels, padding, stride… training even works :slight_smile: :slight_smile: … on a 3x3x2 matrix :-/ :-/ :-/

On 64x64x2 data, backpropagation of a single layer takes 8 seconds.

See below the code of the forward(), backward() (derror/dx) and get_gradient() (derror/dw) functions (full code on GitHub or ]dev BetaML).

Profiling the code, I realised that 95% of the time is spent converting CartesianIndex to StaticVector and computing the indices of the convolution.
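
(For reference, a hotspot like this can be located with the Profile standard library. A minimal sketch, where l, x and de_dy are placeholders for an already-constructed layer, an input array and an output gradient:)

using Profile

backward(l, x, de_dy)                          # warm-up call, so that compilation is not profiled
Profile.clear()
@profile for _ in 1:10; backward(l, x, de_dy); end
Profile.print(format=:flat, sortedby=:count)   # most samples land in the index-conversion code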

As the layer size is known at construction time, would it make sense to:
(a) Cache the relevant convolutional indices inside the ConvLayer struct?
(a.1) Use Int32 in that case?
(b) Use Float32 for the weights? And for the data as well?
(c) Other ideas?


"ConvLayer structure"
mutable struct ConvLayer{ND,NDPLUS1,NDPLUS2} <: AbstractLayer
   "Input size (including nchannel_in as last dimension)"
   input_size::SVector{NDPLUS1,Int64}
   "Weight tensor (aka \"filter\" or \"kernel\") with respect to the input from previous layer or data (kernel_size array augmented by the nchannels_in and nchannels_out dimensions)"
   weight::Array{Float64,NDPLUS2}
   "Wether to use (and learn) a bias weigth [def: true]"
   usebias::Bool
   "Bias (nchannels_out array)"
   bias::Array{Float64,1}
   "Padding (initial)"
   padding_start::SVector{ND,Int64}
   "Padding (ending)"
   padding_end::SVector{ND,Int64}
   "Stride"
   stride::SVector{ND,Int64}
   "Number of dimensions (excluding input and output channels)"
   ndims::Int64
   "Activation function"
   f::Function
   "Derivative of the activation function"
   df::Union{Function,Nothing}
end

"Computation of the padded x"
function _xpadComp(layer::ConvLayer,x)
   input_size, output_size = size(layer)
   # input padding
   padding_start = SVector{layer.ndims+1,Int64}([layer.padding_start...,0])
   padding_end   = SVector{layer.ndims+1,Int64}([layer.padding_end...,0])
   padded_size   = input_size .+  padding_start .+ padding_end
   xstart        = padding_start .+ 1
   xends         = padding_start .+ input_size
   xpadded       = zeros(eltype(x), padded_size...)
   xpadded[[range(s,e,step=1) for (s,e) in zip(xstart,xends)]...] = x
   return xpadded
end

"forward passage"
function _zComp(layer::ConvLayer,x)
   if ndims(x) == 1
      x = reshape(x, size(layer)[1]) # accept a flat input vector and reshape it to the layer input size
   end
   input_size, output_size = size(layer)

   # input padding
   xpadded = _xpadComp(layer,x)

   y = zeros(output_size)

   for yi in CartesianIndices(y)
      yiarr         = convert(SVector{layer.ndims+1,Int64},yi)
      nchannel_out  = yiarr[end]
      starti        = yiarr[1:end-1] .* layer.stride .- layer.stride .+ 1
      starti        = vcat(starti,1)
      endi          = starti .+ convert(SVector{layer.ndims+1,Int64},size(layer.weight)[1:end-1]) .- 1
      weight        = selectdim(layer.weight,layer.ndims+2,nchannel_out)
      xpadded_in    = xpadded[[range(s,e,step=1) for (s,e) in zip(starti,endi)]...]
      y_unbias      = @turbo dot(weight, xpadded_in)
      y[yi]         = layer.bias[nchannel_out] + y_unbias
   end

   return y
end

 function forward(layer::ConvLayer,x)
   z =  _zComp(layer,x) 
   return layer.f.(z)
 end

"backward (with respect to inputs: derror/dx)"
function backward(layer::ConvLayer,x,next_gradient)
 
    z      =  _zComp(layer,x)
   
    if layer.df != nothing
       dfz = layer.df.(z)  
    else
       dfz =  layer.f'.(z) # using AD
    end
    dϵ_dz  = @turbo  dfz .* next_gradient

    nchannels_out = size(layer.weight)[end]
    nchannels_in  = size(layer.weight)[end-1]
    input_size, output_size = size(layer)
    de_dx         = zeros(layer.input_size...)
 
    @inbounds for nch_in in 1:nchannels_in
       w_ch_in = selectdim(layer.weight,layer.ndims+1,nch_in)
       @inbounds for nch_out in 1:nchannels_out
          w_ch_in_out  = selectdim(w_ch_in,layer.ndims+1,nch_out)
          dϵ_dz_ch_out = selectdim(dϵ_dz,layer.ndims+1,nch_out)
          de_dx_ch_in  = selectdim(de_dx,layer.ndims+1,nch_in)
          @inbounds for w_idx in CartesianIndices(w_ch_in_out)
             w_idx = convert(SVector{layer.ndims,Int64},w_idx)
             @inbounds for dey_idx in CartesianIndices(dϵ_dz_ch_out)
                dey_idx = convert(SVector{layer.ndims,Int64},dey_idx)
                idx_x_source_padded = w_idx .+ (dey_idx .- 1 ) .* layer.stride
                if all(idx_x_source_padded .> layer.padding_start) && all(idx_x_source_padded .<=  layer.padding_start .+ input_size[1:end-1])
                   idx_x_source = idx_x_source_padded .- layer.padding_start
                   de_dx_ch_in[idx_x_source...] += dϵ_dz_ch_out[dey_idx...] * w_ch_in_out[w_idx...]
                end
             end
          end
       end
    end
    return de_dx
 end

"get_gradient (with respect to the weights (derror/dw)"
function get_gradient(layer::ConvLayer,x,next_gradient) 
 
    z      =  _zComp(layer,x)
    if layer.df != nothing
       dfz = layer.df.(z)  
    else
       dfz =  layer.f'.(z) # using AD
    end
    dϵ_dz  = @turbo  dfz .* next_gradient
    dw     = zeros(size(layer.weight))
    xpadded = _xpadComp(layer,x)

    # derror_dw computation
    for idx in CartesianIndices(dw)
       idx = convert(SVector{layer.ndims+2,Int64},idx)
       nchannel_out = idx[end]
       nchannel_in  = idx[end-1]
       wdims        = idx[1:end-2]
       dϵ_dz_nchannelOut = selectdim(dϵ_dz,layer.ndims+1,nchannel_out)
       xval = zeros(size(dϵ_dz_nchannelOut))
       for yi in CartesianIndices(dϵ_dz_nchannelOut)
          idx_y = convert(SVector{layer.ndims,Int64},yi)
          idx_x_dest = idx_y
          idx_x_source = wdims .+ (idx_y .- 1 ) .* layer.stride # xpadded[i] = w[i] + (Y[i] -1 ) * STRIDE
          xval[idx_x_dest...] = xpadded[vcat(idx_x_source,nchannel_in)...]
       end
       dw[idx...] = dot(xval,dϵ_dz_nchannelOut)  # slightly more efficient than using += on each individual product
    end

    # derror_dbias computation
    if layer.usebias
       dbias = zeros(length(layer.bias))
       for bias_idx in 1:length(layer.bias)
          nchannel_out = bias_idx
          dϵ_dz_nchannelOut = selectdim(dϵ_dz,layer.ndims+1,nchannel_out)
          dbias[bias_idx] = sum(dϵ_dz_nchannelOut)
       end
       return Learnable((dw,dbias))
    else
       return Learnable((dw,))
     end
 end

By converting the CartesianIndex to a Tuple instead of an SArray I got a 20% speed-up… but I need a 1000x speedup to make the lib barely usable :-//
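
(Part of the remaining cost is probably not the conversion itself but the fact that layer.ndims+1 is a runtime value used as a type parameter, so SVector{layer.ndims+1,Int64} cannot be specialised by the compiler. A minimal, hypothetical sketch of the difference:)

using StaticArrays, BenchmarkTools

# N passed as a runtime value: the SVector type must be built dynamically on every call
to_svec_dynamic(t, n) = SVector{n,Int64}(t)

# N taken from the tuple's own type parameter: fully known to the compiler
to_svec_static(t::NTuple{N,Int}) where {N} = SVector{N,Int64}(t)

t = Tuple(CartesianIndex(3, 4, 1))
@btime to_svec_dynamic($t, 3)   # type-unstable: the result type depends on a runtime value
@btime to_svec_static($t)       # fully specialised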

What’s the goal here?

I think that making conv fast is about as hard as making matrix multiplication fast. Having things like if all(idx_x_source_padded .> layer.padding_start) in inner loops seems unlikely to compile to a straight, SIMD-friendly set of operations. You should likely work directly with integers: no arrays, no broadcasting, and ideally no branches.
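
A minimal sketch of that advice for the 2D, single-channel, stride-1, unpadded case (illustrative only, not drop-in code for the layer above):

# plain-integer direct convolution: no index arrays, no broadcasting, no branches in the inner loops
function conv2d_direct!(y::Matrix{Float64}, x::Matrix{Float64}, w::Matrix{Float64})
   kh, kw = size(w)
   oh, ow = size(y)            # assumes size(y) == size(x) .- size(w) .+ 1
   @inbounds for j in 1:ow, i in 1:oh
      acc = 0.0
      for q in 1:kw, p in 1:kh
         acc += x[i+p-1, j+q-1] * w[p, q]
      end
      y[i, j] = acc
   end
   return y
end

x = rand(64, 64); w = rand(4, 4)
y = zeros(61, 61)               # 64 - 4 + 1
conv2d_direct!(y, x, w)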

OK, but what about storing the ids of the convolution in the layer structure:

mutable struct ConvLayer{ND,NDPLUS1,NDPLUS2} <: AbstractLayer
   [...]
   "x ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   x_ids::Array{NTuple{NDPLUS1,Int32},1}
   "y ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   y_ids::Array{NTuple{NDPLUS1,Int32},1}
   "w ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   w_ids::Array{NTuple{NDPLUS2,Int32},1}
end

Then create a preprocess(layer::ConvLayer) function to fill these arrays at the beginning of fit!, and at forward/backward/get_gradient time do something like:


y = zeros(size(layer)[2])
for i in 1:length(layer.x_ids)
   y[layer.y_ids[i]...] += x[layer.x_ids[i]...] * layer.weight[layer.w_ids[i]...]
end

Would it still be super slow because of the indirect referencing? Is there any neural network framework that takes this approach?

Indeed, by caching the convolution ids in the layer struct I got a very nice speed-up that makes it usable for small jobs :slight_smile: :slight_smile: :slight_smile:

Here is a self-standing script:


"Part of [BetaML](https://github.com/sylvaticus/BetaML.jl). Licence is MIT."

# Experimental

abstract type AbstractLayer end


using Random, LinearAlgebra, StaticArrays, LoopVectorization, Distributions
import Base.size

"""
ConvLayer

Representation of a convolutional layer in the network

"""
mutable struct ConvLayer{ND,NDPLUS1,NDPLUS2} <: AbstractLayer
   "Input size (including nchannel_in as last dimension)"
   input_size::SVector{NDPLUS1,Int64}
   "Weight tensor (aka \"filter\" or \"kernel\") with respect to the input from previous layer or data (kernel_size array augmented by the nchannels_in and nchannels_out dimensions)"
   weight::Array{Float64,NDPLUS2}
   "Wether to use (and learn) a bias weigth [def: true]"
   usebias::Bool
   "Bias (nchannels_out array)"
   bias::Array{Float64,1}
   "Padding (initial)"
   padding_start::SVector{ND,Int64}
   "Padding (ending)"
   padding_end::SVector{ND,Int64}
   "Stride"
   stride::SVector{ND,Int64}
   "Number of dimensions (excluding input and output channels)"
   ndims::Int64
   "Activation function"
   f::Function
   "Derivative of the activation function"
   df::Union{Function,Nothing}

   "x ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   x_ids::Array{NTuple{NDPLUS1,Int32},1}
   "y ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   y_ids::Array{NTuple{NDPLUS1,Int32},1}
   "w ids of the convolution (computed in `preprocessing`` - itself at the beginning of `train`"
   w_ids::Array{NTuple{NDPLUS2,Int32},1}


   """
   ConvLayer()

   Instantiate a new nD-dimensional, possibly multichannel ConvolutionalLayer

   The input data is either a column vector (in which case it is reshaped) or an array of `input_size` augmented by the `n_channels` dimension; the output size depends on the `input_size`, `kernel_size`, `padding` and `stride`, but always has `nchannels_out` as its last dimension.

   # Positional arguments:
   * `input_size`:    Shape of the input layer (integer for 1D convolution, tuple otherwise). Do not consider the channels number here.
   * `kernel_size`:   Size of the kernel (aka filter or learnable weights) (integer for 1D or hypercube kernels, or an nD-sized tuple for asymmetric kernels). Do not consider the channels number here.
   * `nchannels_in`:  Number of channels in input
   * `nchannels_out`: Number of channels in output
   # Keyword arguments:
   * `kernel_init`:   Initial weights with respect to the input [default: Xavier initialisation]. If given, it should be a multidimensional array of `kernel_size` augmented by the `nchannels_in` and `nchannels_out` dimensions
   * `bias_init`:     Initial weights with respect to the bias [default: Xavier initialisation]. If given, it should be an `nchannels_out` vector of scalars.
   * `f`:   Activation function [def: `relu`]
   * `df`:  Derivative of the activation function [default: `nothing` (i.e. use AD)]
   * `rng`: Random Number Generator (see [`FIXEDSEED`](@ref)) [default: `Random.GLOBAL_RNG`]

   # Notes:
   - Xavier initialization is sampled from a `Uniform` distribution between `± sqrt(6/(prod(input_size)*nchannels_in))`
   - to retrieve the output size of the layer, use `size(ConvLayer)[2]`. The output size on each dimension _d_ (except the last one, which is given by `nchannels_out`) is given by the following formula (floored): `output_size[d] = 1 + (input_size[d]+2*padding[d]-kernel_size[d])/stride[d]`
   """
   function ConvLayer(input_size,kernel_size,nchannels_in,nchannels_out;
            stride  = (ones(Int64,length(input_size))...,),
            rng     = Random.GLOBAL_RNG,
            padding = nothing, # zeros(Int64,length(input_size)),
            kernel_init  = rand(rng, Uniform(-sqrt(6/(prod(input_size)*nchannels_in)),sqrt(6/(prod(input_size)*nchannels_in))),(kernel_size...,nchannels_in,nchannels_out)...),
            usebias = true,
            bias_init    = usebias ? rand(rng, Uniform(-sqrt(6/(prod(input_size)*nchannels_in)),sqrt(6/(prod(input_size)*nchannels_in))),nchannels_out) : zeros(Float64,nchannels_out),
            f       = identity,
            df      = nothing)
      
      # be sure all are tuples of right dimension...
      if typeof(input_size) <: Integer
         input_size = (input_size,)
      end
      nD = length(input_size)
      if typeof(kernel_size) <: Integer
         kernel_size = ([kernel_size for d in 1:nD]...,)
      end
      length(input_size) == length(kernel_size) || error("Number of dimensions of the kernel must equate number of dimensions of input data")
      if typeof(stride) <: Integer
         stride = ([stride for d in 1:nD]...,)
      end
      if typeof(padding) <: Integer
         padding_start = ([padding for d in 1:nD]...,)
         padding_end   = ([padding for d in 1:nD]...,)
      elseif isnothing(padding) # compute padding to keep same size/stride if not provided
         target_out_size = [Int(round(input_size[d]/stride[d])) for d in 1:length(input_size)]
         padding_total   = [(target_out_size[d]-1)*stride[d] - input_size[d]+kernel_size[d] for d in 1:length(input_size)]
         padding_start   =  Int.(ceil.(padding_total ./ 2))
         padding_end     =  padding_total .- padding_start  
      else
         padding_start = padding[1]
         padding_end   = padding[2]
      end
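      # Illustrative check of the automatic padding above: with input_size[d]=32, kernel_size[d]=4
      # and stride[d]=1 we get target_out_size[d]=32 and padding_total[d]=(32-1)*1-32+4=3, hence
      # padding_start[d]=2 and padding_end[d]=1, so the spatial output size stays at 32.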
      nD == length(stride) || error("`stride` must be either a scalar or a tuple that equates the number of dimensions of input data")
      nD == length(padding_start) == length(padding_end) || error("`padding` must be: (a) the value `nothing` for automatic computation, (b) a scalar for same padding on all dimensions or (c) a 2-elements tuple where each elements are tuples that equate the number of dimensions of input data for indicating the padding to set in front of the data and the padding to set at the ending of the data")

      new{nD,nD+1,nD+2}((input_size...,nchannels_in),kernel_init,usebias,bias_init,padding_start,padding_end,stride,nD,f,df,[],[],[])
   end
end



function preprocess!(layer::ConvLayer{ND,NDPLUS1,NDPLUS2}) where {ND,NDPLUS1,NDPLUS2}

   if length(layer.x_ids) > 0
      empty!(layer.x_ids)
      empty!(layer.w_ids)
      empty!(layer.y_ids)
   end

   input_size, output_size = size(layer)
   nchannels_out = output_size[end]
   nchannels_in  = input_size[end]
   convsize      = input_size[1:end-1]
   ndims_conv    = ND

   wsize = size(layer.weight)
   ysize = output_size
  
   # preallocating temp variables
   w_idx               = Array{Int32,1}(undef,NDPLUS2)
   y_idx               = Array{Int32,1}(undef,NDPLUS1)
   w_idx_conv          = Array{Int32,1}(undef,ND)
   y_idx_conv          = Array{Int32,1}(undef,ND)
   idx_x_source_padded = Array{Int32,1}(undef,ND)
   checkstart          = Array{Bool,1}(undef,ND)
   checkend            = Array{Bool,1}(undef,ND)
   x_idx               = Array{Int32,1}(undef,NDPLUS1)

   @inbounds for nch_in in 1:nchannels_in
      @inbounds for nch_out in 1:nchannels_out
         @inbounds for w_idx_conv in CartesianIndices( ((wsize[1:end-2]...),) ) 
            w_idx_conv = Tuple(w_idx_conv)
            w_idx = (w_idx_conv...,nch_in,nch_out)
            @inbounds for y_idx_conv in CartesianIndices( ((ysize[1:end-1]...),) )
               y_idx_conv = Tuple(y_idx_conv)
               y_idx      = (y_idx_conv...,nch_out)
               check = true
               @inbounds for d in 1:ndims_conv
                  idx_x_source_padded[d] = w_idx_conv[d] + (y_idx_conv[d] - 1 ) * layer.stride[d]
                  checkstart[d] =  idx_x_source_padded[d] > layer.padding_start[d]
                  checkend[d]   =  idx_x_source_padded[d] <=  layer.padding_start[d] .+ convsize[d]
                  checkstart[d] && checkend[d] || begin check = false; break; end
               end
               check || continue
               
               @inbounds @simd  for d in 1:ndims_conv
                  x_idx[d] = idx_x_source_padded[d] - layer.padding_start[d]
               end
               x_idx[ndims_conv+1] = nch_in
               push!(layer.x_ids,((x_idx...,)))
               push!(layer.w_ids,w_idx)
               push!(layer.y_ids,y_idx)            
            end
         end
      end
   end
end
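
# Note on memory: each of the three id arrays holds up to
# nchannels_in * nchannels_out * prod(kernel_size) * prod(output_spatial_size) entries
# (minus the positions clipped by the padding checks), e.g. roughly 3*5*16*1024 ≈ 245k
# index tuples per array for the 32x32x3 -> 5-channel example used further below.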


function _xpadComp(layer::ConvLayer,x)
   input_size, output_size = size(layer)
   # input padding
   padding_start = SVector{layer.ndims+1,Int64}([layer.padding_start...,0])
   padding_end   = SVector{layer.ndims+1,Int64}([layer.padding_end...,0])
   padded_size   = input_size .+  padding_start .+ padding_end
   xstart        = padding_start .+ 1
   xends         = padding_start .+ input_size
   xpadded       = zeros(eltype(x), padded_size...)
   xpadded[[range(s,e,step=1) for (s,e) in zip(xstart,xends)]...] = x
   return xpadded
end

function _zComp_old(layer::ConvLayer,x)
   if ndims(x) == 1
      x = reshape(x, size(layer)[1])
   end
    input_size, output_size = size(layer)
  
    # input padding
    xpadded = _xpadComp(layer,x)

    y = zeros(output_size)

    for yi in CartesianIndices(y)
     yiarr         = Tuple(yi)
     nchannel_out  = yiarr[end]
     starti        = yiarr[1:end-1] .* layer.stride .- layer.stride .+ 1
     starti        = vcat(starti,1)
     endi          = starti .+  convert(SVector{layer.ndims+1,Int64},size(layer.weight)[1:end-1]) .- 1
     weight        = selectdim(layer.weight,layer.ndims+2,nchannel_out)
     xpadded_in    = xpadded[[range(s,e,step=1) for (s,e) in zip(starti,endi)]...]
     y_unbias      = @turbo dot(weight, xpadded_in)
     y[yi]         = layer.bias[nchannel_out] .+ y_unbias
    end
  
    return y
end


function _zComp(layer::ConvLayer{ND,NDPLUS1,NDPLUS2},x) where {ND,NDPLUS1,NDPLUS2} # slower :-/
   input_size, output_size = size(layer)
   nchannels_out = output_size[end]
   nchannels_in  = input_size[end]
   convsize      = input_size[1:end-1]
   ndims_conv    = ND

   if ndims(x) == 1
      x = reshape(x, size(layer)[1])
   end

   y = zeros(output_size)
   lx_ids  = layer.x_ids
   ly_ids  = layer.y_ids
   lw_ids  = layer.w_ids
   lweight = layer.weight

   @simd for idx in 1:length(layer.y_ids)
     y[ly_ids[idx]...] += x[lx_ids[idx]...] * lweight[lw_ids[idx]...] 
   end

   for ch_out in 1:nchannels_out
      y_ch_out = selectdim(y,NDPLUS1,ch_out)
      y_ch_out .+= layer.bias[ch_out]
   end

   return y
end


"""
forward()

Compute forward pass of a ConvLayer

"""
function forward_old(layer::ConvLayer,x)
  z =  _zComp_old(layer,x) 
  return layer.f.(z)
end

function forward(layer::ConvLayer,x)
   z =  _zComp(layer,x) 
   return layer.f.(z)
 end


function backward_old(layer::ConvLayer,x,next_gradient) # with respect to inputs: derror/dx

   z      =  _zComp_old(layer,x)
  
   if layer.df != nothing
      dfz = layer.df.(z)  
   else
      dfz =  layer.f'.(z) # using AD
   end
   dϵ_dz  = @turbo  dfz .* next_gradient

   nchannels_out = size(layer.weight)[end]
   nchannels_in  = size(layer.weight)[end-1]
   input_size, output_size = size(layer)
   de_dx         = zeros(layer.input_size...)

   @inbounds for nch_in in 1:nchannels_in
      w_ch_in = selectdim(layer.weight,layer.ndims+1,nch_in)
      @inbounds for nch_out in 1:nchannels_out
         w_ch_in_out  = selectdim(w_ch_in,layer.ndims+1,nch_out)
         dϵ_dz_ch_out = selectdim(dϵ_dz,layer.ndims+1,nch_out)
         de_dx_ch_in  = selectdim(de_dx,layer.ndims+1,nch_in)
         @inbounds for w_idx in CartesianIndices(w_ch_in_out)
            w_idx = Tuple(w_idx)
            @inbounds for dey_idx in CartesianIndices(dϵ_dz_ch_out)
               dey_idx = Tuple(dey_idx)
               idx_x_source_padded = w_idx .+ (dey_idx .- 1 ) .* layer.stride
               checkstart = idx_x_source_padded .> layer.padding_start
               checkend   = idx_x_source_padded .<=  layer.padding_start .+ input_size[1:end-1]
               if all(checkstart) && all(checkend)
                  idx_x_source = idx_x_source_padded .- layer.padding_start
                  de_dx_ch_in[idx_x_source...] += dϵ_dz_ch_out[dey_idx...] * w_ch_in_out[w_idx...]
               end
            end
         end
      end
   end
   return de_dx
end


function backward(layer::ConvLayer{ND,NDPLUS1,NDPLUS2},x, next_gradient) where {ND,NDPLUS1,NDPLUS2}
   input_size, output_size = size(layer)

   z      =  _zComp(layer,x)
  
   if layer.df != nothing
      dfz = layer.df.(z)  
   else
      dfz =  layer.f'.(z) # using AD
   end
   dϵ_dz  = @turbo  dfz .* next_gradient
   de_dx  = zeros(layer.input_size...)

   lx_ids  = layer.x_ids
   ly_ids  = layer.y_ids
   lw_ids  = layer.w_ids
   lweight = layer.weight

   @simd for idx in 1:length(layer.y_ids)
      de_dx[lx_ids[idx]...] += dϵ_dz[ly_ids[idx]...] * lweight[lw_ids[idx]...] 
   end
   return de_dx
end



function get_gradient_old(layer::ConvLayer,x,next_gradient) # derror/dw
   z      =  _zComp_old(layer,x)
   if layer.df != nothing
      dfz = layer.df.(z)  
   else
      dfz =  layer.f'.(z) # using AD
   end
   dϵ_dz  = @turbo  dfz .* next_gradient
   dw    = zeros(size(layer.weight))

   xpadded = _xpadComp(layer,x)

   # dw computation
   for idx in CartesianIndices(dw)
      idx = Tuple(idx)
      nchannel_out = idx[end]
      nchannel_in  = idx[end-1]
      wdims        = idx[1:end-2]
      dϵ_dz_nchannelOut = selectdim(dϵ_dz,layer.ndims+1,nchannel_out)
      xval = zeros(size(dϵ_dz_nchannelOut))
      for yi in CartesianIndices(dϵ_dz_nchannelOut)
         idx_y = Tuple(yi)
         idx_x_dest = idx_y
         idx_x_source = wdims .+ (idx_y .- 1 ) .* layer.stride # xpadded[i] = w[i] + (Y[i] -1 ) * STRIDE
         xval[idx_x_dest...] = xpadded[vcat(idx_x_source,nchannel_in)...]  
      end
      dw[idx...] = dot(xval,dϵ_dz_nchannelOut)  # slightly more efficient than using += on each individual product
   end

   if layer.usebias
      dbias = zeros(length(layer.bias))
      for bias_idx in 1:length(layer.bias)
         nchannel_out = bias_idx
         dϵ_dz_nchannelOut = selectdim(dϵ_dz,layer.ndims+1,nchannel_out)
         dbias[bias_idx] = sum(dϵ_dz_nchannelOut)
      end
      return (dw,dbias)
   else
      return (dw,)
    end
end


function get_gradient(layer::ConvLayer{ND,NDPLUS1,NDPLUS2},x, next_gradient) where {ND,NDPLUS1,NDPLUS2}
   z      =  _zComp(layer,x)
  
   if layer.df != nothing
      dfz = layer.df.(z)  
   else
      dfz =  layer.f'.(z) # using AD
   end

   dϵ_dz  = @turbo  dfz .* next_gradient
   de_dw  = zeros(size(layer.weight))

   lx_ids  = layer.x_ids
   ly_ids  = layer.y_ids
   lw_ids  = layer.w_ids

   @simd for idx in 1:length(layer.y_ids)
      de_dw[lw_ids[idx]...] += dϵ_dz[ly_ids[idx]...] * x[lx_ids[idx]...] 
   end

   if layer.usebias
      dbias = zeros(length(layer.bias))
      for bias_idx in 1:length(layer.bias)
         nchannel_out = bias_idx
         dϵ_dz_nchannelOut = selectdim(dϵ_dz,layer.ndims+1,nchannel_out)
         dbias[bias_idx] = sum(dϵ_dz_nchannelOut)
      end
      return (de_dw,dbias)
   else
      return (de_dw,)
   end
end


"""
size()

Get the dimensions of the layer in terms of (input dimensions, output dimensions), including the channels as the last dimension
"""
function size(layer::ConvLayer)
   nchannels_in  = layer.input_size[end]
   nchannels_out = size(layer.weight)[end]
   in_size  = (layer.input_size...,)
   out_size = ([1 + Int(floor((layer.input_size[d]+layer.padding_start[d]+layer.padding_end[d]-size(layer.weight,d))/layer.stride[d])) for d in 1:layer.ndims]...,nchannels_out)
   return (in_size,out_size)
end

And then we can use it with:


using BenchmarkTools

# Multiple in_channel/ out_channel
x = reshape(1:(32*32*3),32,32,3)
l = ConvLayer((32,32),(4,4),3,5,f=identity,df=x->1) # from 32x32x3 to ?x?x5 using a 4x4 filter; df is provided to avoid a Zygote dependency in this script
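# (with the automatic padding computed by the constructor the spatial size is preserved: size(l) should return ((32, 32, 3), (32, 32, 5)))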
preprocess!(l)
print("preproces btime: ")
@btime preprocess!($l)
y = forward_old(l,x)
print("forward_old btime: ")
@btime forward_old($l,$x)
y2 = forward(l,x) 
y2 == y
print("forward btime: ")
@btime forward($l,$x)
de_dy = y ./ 10
de_dw_old = get_gradient_old(l,x,de_dy)
print("get_gradient_old btime: ")
@btime get_gradient_old($l,$x,$de_dy)
de_dw     = get_gradient(l,x,de_dy)
de_dw_old[1] == de_dw[1]
print("get_gradient btime: ")
@btime get_gradient($l,$x,$de_dy)
de_dx_old = backward_old(l,x,de_dy)
print("backward_old btime: ")
@btime backward_old($l,$x,$de_dy)
de_dx     = backward(l,x,de_dy)
de_dx_old[1] == de_dx[1]
print("backward btime: ")
@btime backward($l,$x,$de_dy)

that produces:

preprocess btime: 
  6.481 ns (0 allocations: 0 bytes)

forward_old btime: 
  39.261 ms (302153 allocations: 14.48 MiB)

forward btime: 
  21.354 ms (691964 allocations: 10.64 MiB)

get_gradient_old btime: 
  882.561 ms (6585411 allocations: 221.10 MiB)

get_gradient btime: 
  73.475 ms (1806178 allocations: 27.68 MiB)

backward_old btime: 
  1.361 s (11706692 allocations: 443.24 MiB)

backward btime: 
  70.537 ms (1845173 allocations: 28.30 MiB)

At the end I implemented the cached version of the convolutional layer (ConvLayer) and of the pooling layer (PoolingLayer) in BetaML v0.9.5… very flexible… it accepts arrays of any dimension in input, and any function can be used in the two types of layers. If it is a “well known” function, the corresponding derivative is picked up, otherwise AD (Zygote) comes to the rescue. This means that for most common tasks (prediction, regression, convolution…) we can run a NN model without AD.
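
(For clarity on the “well known” function mechanism: conceptually it is just a lookup from the activation function to its analytic derivative, with AD as the fallback. A minimal, hypothetical sketch of the idea, not the actual BetaML code:)

relu(x)  = max(zero(x), x)
drelu(x) = x > zero(x) ? one(x) : zero(x)

# map of "well known" activations to their analytic derivatives
const KNOWN_DERIVATIVES = Dict{Function,Function}(relu => drelu, identity => (x -> one(x)))

# return the analytic derivative if available, `nothing` otherwise (the caller then falls back to Zygote)
pick_derivative(f::Function) = get(KNOWN_DERIVATIVES, f, nothing)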

Unfortunately, while it is hundreds of times faster than before, it is still thousands of times slower than needed for any practical task. A pity :-//

The following script on a subset of MNIST runs in 10-15 minutes on a moderately powerful laptop, with a train/test accuracy of ~35% that demonstrates that “learning” is indeed happening… but sloooowly…:

using BetaML
using Statistics
using MLDatasets # For loading the training data
using Plots            # For plotting the CM

TESTRNG = FIXEDRNG

x_train, y_train = MLDatasets.MNIST.traindata()
x_train          = permutedims(x_train,(3,2,1))
x_train          = convert(Array{Float32,3},x_train)
x_train          = reshape(x_train,size(x_train,1),size(x_train,2)*size(x_train,3))
ohm              = OneHotEncoder()
y_train_oh       = fit!(ohm,y_train)

x_test, y_test = MLDatasets.MNIST.testdata()
x_test          = permutedims(x_test,(3,2,1))
x_test          = convert(Array{Float32,3},x_test)
x_test          = reshape(x_test,size(x_test,1),size(x_test,2)*size(x_test,3))
y_test_oh       = predict(ohm,y_test)


(N,D)    = size(x_train)
l1       = ReshaperLayer((D,1),(28,28,1))
l2       = ConvLayer(size(l1)[2],(5,5),8,stride=2,rng=copy(TESTRNG))
l3       = PoolingLayer(size(l2)[2],(2,2))
l4       = ConvLayer(size(l3)[2],(3,3),16,stride=2,rng=copy(TESTRNG))
l5       = PoolingLayer(size(l4)[2],(2,2))
l6       = ReshaperLayer(size(l5)[2])
l7       = DenseLayer(size(l6)[2][1],10,f=BetaML.relu, rng=copy(TESTRNG))
l8       = VectorFunctionLayer(size(l7)[2][1],f=BetaML.softmax)
layers   = [l1,l2,l3,l4,l5,l6,l7,l8]
m = NeuralNetworkEstimator(layers=layers,loss=squared_cost,verbosity=HIGH,batch_size=64,epochs=5)

(x_debug,x_other),(y_debug_oh,y_other_oh)  = partition([x_train,y_train_oh],[0.01,0.99],rng=copy(TESTRNG))

ŷ = fit!(m,x_debug,y_debug_oh)

y_true  = inverse_predict(ohm,convert(Matrix{Bool},y_debug_oh))
ŷ_nonoh = inverse_predict(ohm,ŷ)
accuracy(y_true,ŷ_nonoh)
hcat(y_true,ŷ_nonoh)

ŷtest       = predict(m,x_test)
ytest_true  = inverse_predict(ohm,convert(Matrix{Bool},y_test_oh))
ŷtest_nonoh = inverse_predict(ohm,ŷtest)
accuracy(ytest_true,ŷtest_nonoh)
hcat(ytest_true,ŷtest_nonoh)

cm = ConfusionMatrix()
fit!(cm,ytest_true,ŷtest_nonoh)
print(cm)
julia> println(cm)
A ConfusionMatrix BetaMLModel (fitted)

-----------------------------------------------------------------

*** CONFUSION MATRIX ***

Scores actual (rows) vs predicted (columns):

11×11 Matrix{Any}:
 "Labels"    "7"     "2"    "1"    "0"     "4"    "9"    "5"     "6"    "3"     "8"
 "7"       28     177     22     27     362     20     19      71     18     284
 "2"        0     901      0      0      13      0      0      57      0      61
 "1"        1     113      1      1       5      3      1     293      4     713
 "0"        5     369      8      3      59      5      1     400      8     122
 "4"        0      13      0      1     860      0      0      52      0      56
 "9"        0      20      3      1     691      1      0      27      1     265
 "5"        6      24      2      1     144      4      1     397      3     310
 "6"        0      27      0      0      48      0      0     880      0       3
 "3"        3     235      3      1      27      0      1     189      2     549
 "8"        0      16      0      0      37      0      0      62      1     858
Normalised scores actual (rows) vs predicted (columns):

11×11 Matrix{Any}:
 "Labels"   "7"          "2"        "1"          "0"          "4"         "9"         "5"          "6"        "3"         "8"
 "7"       0.0272374    0.172179   0.0214008    0.0262646    0.35214     0.0194553   0.0184825    0.0690661  0.0175097   0.276265
 "2"       0.0          0.873062   0.0          0.0          0.0125969   0.0         0.0          0.0552326  0.0         0.0591085
 "1"       0.000881057  0.0995595  0.000881057  0.000881057  0.00440529  0.00264317  0.000881057  0.25815    0.00352423  0.628194
 "0"       0.00510204   0.376531   0.00816327   0.00306122   0.0602041   0.00510204  0.00102041   0.408163   0.00816327  0.12449
 "4"       0.0          0.0132383  0.0          0.00101833   0.875764    0.0         0.0          0.0529532  0.0         0.0570265
 "9"       0.0          0.0198216  0.00297324   0.00099108   0.684836    0.00099108  0.0          0.0267592  0.00099108  0.262636
 "5"       0.00672646   0.0269058  0.00224215   0.00112108   0.161435    0.0044843   0.00112108   0.445067   0.00336323  0.347534
 "6"       0.0          0.0281837  0.0          0.0          0.0501044   0.0         0.0          0.91858    0.0         0.00313152
 "3"       0.0029703    0.232673   0.0029703    0.000990099  0.0267327   0.0         0.000990099  0.187129   0.0019802   0.543564
 "8"       0.0          0.0164271  0.0          0.0          0.0379877   0.0         0.0          0.063655   0.00102669  0.880903

 *** CONFUSION REPORT ***

- Accuracy:               0.3535
- Misclassification rate: 0.6465000000000001
- Number of classes:      10

  N Class   precision   recall  specificity  f1score  actual_count  predicted_count
                          TPR       TNR                 support                  

  1 7           0.651    0.027        0.998    0.052         1028              43
  2 2           0.475    0.873        0.889    0.616         1032            1895
  3 1           0.026    0.001        0.996    0.002         1135              39
  4 0           0.086    0.003        0.996    0.006          980              35
  5 4           0.383    0.876        0.846    0.533          982            2246
  6 9           0.030    0.001        0.996    0.002         1009              33
  7 5           0.043    0.001        0.998    0.002          892              23
  8 6           0.362    0.919        0.829    0.520          958            2428
  9 3           0.054    0.002        0.996    0.004         1010              37
 10 8           0.266    0.881        0.738    0.409          974            3221

- Simple   avg.    0.238    0.358        0.928    0.215
- Weigthed avg.    0.238    0.353        0.930    0.212

-----------------------------------------------------------------
Output of `info(cm)`:
- mean_precision:       (0.23775332496030574, 0.23798049942477636)
- fitted_records:       10000
- specificity:  [0.9983281319661168, 0.8891614629794826, 0.9957134799774394, 0.9964523281596452, 0.846307385229541, 0.9964408853297743, 0.9975845410628019, 0.8287989382879893, 0.9961067853170189, 0.7382007533791269]
- precision:    [0.6511627906976745, 0.47546174142480213, 0.02564102564102564, 0.08571428571428572, 0.38290293855743546, 0.030303030303030304, 0.043478260869565216, 0.3624382207578254, 0.05405405405405406, 0.2663769015833592]
- misclassification:    0.6465000000000001
- mean_recall:  (0.35835816198752957, 0.3535)
- n_categories: 10
- normalised_scores:    [0.027237354085603113 0.17217898832684825 0.021400778210116732 0.026264591439688716 0.3521400778210117 0.019455252918287938 0.01848249027237354 0.06906614785992218 0.017509727626459144 0.27626459143968873; 0.0 0.873062015503876 0.0 0.0 0.012596899224806201 0.0 0.0 0.055232558139534885 0.0 0.05910852713178295; 0.000881057268722467 0.09955947136563877 0.000881057268722467 0.000881057268722467 0.004405286343612335 0.0026431718061674008 0.000881057268722467 0.2581497797356828 0.003524229074889868 0.628193832599119; 0.00510204081632653 0.376530612244898 0.00816326530612245 0.003061224489795918 0.06020408163265306 0.00510204081632653 0.0010204081632653062 0.40816326530612246 0.00816326530612245 0.12448979591836734; 0.0 0.013238289205702648 0.0 0.0010183299389002036 0.8757637474541752 0.0 0.0 0.05295315682281059 0.0 0.05702647657841141; 0.0 0.019821605550049554 0.002973240832507433 0.0009910802775024777 0.6848364717542121 0.0009910802775024777 0.0 0.026759167492566897 0.0009910802775024777 0.2626362735381566; 0.006726457399103139 0.026905829596412557 0.002242152466367713 0.0011210762331838565 0.16143497757847533 0.004484304932735426 0.0011210762331838565 0.44506726457399104 0.0033632286995515697 0.3475336322869955; 0.0 0.028183716075156576 0.0 0.0 0.05010438413361169 0.0 0.0 0.918580375782881 0.0 0.003131524008350731; 0.0029702970297029703 0.23267326732673269 0.0029702970297029703 0.0009900990099009901 0.026732673267326732 0.0 0.0009900990099009901 0.18712871287128713 0.0019801980198019802 0.5435643564356436; 0.0 0.01642710472279261 0.0 0.0 0.037987679671457907 0.0 0.0 0.06365503080082136 0.001026694045174538 0.8809034907597536]
- tn:   [8957, 7974, 8827, 8988, 7632, 8959, 9086, 7494, 8955, 6663]
- mean_f1score: (0.21451589602864818, 0.21241972340379744)
- actual_count: [1028, 1032, 1135, 980, 982, 1009, 892, 958, 1010, 974]
- accuracy:     0.3535
- recall:       [0.027237354085603113, 0.873062015503876, 0.000881057268722467, 0.003061224489795918, 0.8757637474541752, 0.0009910802775024777, 0.0011210762331838565, 0.918580375782881, 0.0019801980198019802, 0.8809034907597536]
- f1score:      [0.05228758169934641, 0.6156474205671336, 0.0017035775127768314, 0.005911330049261084, 0.5328376703841388, 0.0019193857965451055, 0.002185792349726776, 0.5197873597164796, 0.0038204393505253103, 0.4090584028605483]
- mean_specificity:     (0.9283094691688938, 0.9295946916889363)
- predicted_count:      [43, 1895, 39, 35, 2246, 33, 23, 2428, 37, 3221]
- scores:       [28 177 22 27 362 20 19 71 18 284; 0 901 0 0 13 0 0 57 0 61; 1 113 1 1 5 3 1 293 4 713; 5 369 8 3 59 5 1 400 8 122; 0 13 0 1 860 0 0 52 0 56; 0 20 3 1 691 1 0 27 1 265; 6 24 2 1 144 4 1 397 3 310; 0 27 0 0 48 0 0 880 0 3; 3 235 3 1 27 0 1 189 2 549; 0 16 0 0 37 0 0 62 1 858]
- tp:   [28, 901, 1, 3, 860, 1, 1, 880, 2, 858]
- fn:   [1000, 131, 1134, 977, 122, 1008, 891, 78, 1008, 116]
- categories:   [7, 2, 1, 0, 4, 9, 5, 6, 3, 8]
- fp:   [15, 994, 38, 32, 1386, 32, 22, 1548, 35, 2363]

julia> res = info(cm);

julia> heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")

[Image: confusion matrix heatmap (normalised scores)]
