Bayesian Neural Network (Multiclass Classification) in Turing.jl and Flux.jl

It seems there are two problems here. The first is that the softmax function in NNlib has some pretty restrictive typing, which means it doesn't play nicely with the automatic differentiation. The other is that the anonymous function you defined as a replacement for softmax, xs -> exp.(xs) / sum(exp.(xs)), doesn't actually work columnwise: it takes the sum of the entire rectangular array and divides every element by that sum, so the whole array sums to one but no individual column is a probability vector. I wrote my own softmax function that plays fairly nicely with Flux:

function softmax_(arr::AbstractArray)
    # Exponentiate every entry, then normalize each column so it sums to one.
    ex = mapslices(x -> exp.(x), arr, dims=1)
    rows, cols = size(arr)
    val = similar(ex)
    for i in 1:cols
        s = sum(ex[:,i])          # normalizing constant for column i
        for j in 1:rows
            val[j,i] = ex[j,i] / s
        end
    end
    return val
end
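
As a quick sanity check (using a made-up 2×2 matrix), each column of softmax_'s output sums to one, whereas the original anonymous function only normalizes over the whole array:

A = [1.0 3.0; 2.0 4.0]

sum(softmax_(A), dims=1)                      # each column ≈ 1.0
bad = (xs -> exp.(xs) / sum(exp.(xs)))(A)
sum(bad)                                      # 1.0 over the whole array...
sum(bad, dims=1)                              # ...but neither column sums to 1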

And I changed the feedforward function to:

function feedforward(inp::AbstractArray, theta::AbstractVector)
    W0, b0, W1, b1, W2, b2 = weights(theta)
    model = Chain(
        Dense(W0, b0, tanh),
        Dense(W1, b1, tanh),
        Dense(W2, b2, σ),
        softmax_
    )

    return model(inp)
end

Note that you'll also get a speedup from using Turing's reverse mode AD (which uses Flux) by calling setadbackend(:reverse_diff). Reverse mode tends to be faster when you have a lot of parameters, as is the case with neural nets.
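
For example (the exact backend symbols may differ between Turing versions, so treat this as a sketch):

using Turing

setadbackend(:reverse_diff)    # reverse-mode AD, backed by Flux's Tracker
# setadbackend(:forward_diff)  # the forward-mode default, fine for small models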

I haven't checked whether the results are sensible, but here's the full code with my modifications (there's a quick sanity-check sketch after it):

using Flux
using RDatasets
using Measures
using StatsPlots
using PlotThemes
using Turing
pyplot()
theme(:orange)

iris = dataset("datasets", "iris");

@df iris scatter(:SepalLength, :SepalWidth, group = :Species,
    xlabel = "Length", ylabel = "Width", markersize = 5,
    markeralpha = 0.75, markerstrokewidth = 0, linealpha = 0,
    m = (0.5, [:cross :hex :star7], 12),
    margin = 5mm)

inputs = convert(Matrix, iris[1:4]);
# Categorical in the model below is supported on 1:K, so code the species as 1, 2, 3 (not 0, 1, 2).
labels = map(x -> x == "setosa" ? 1 : x == "versicolor" ? 2 : 3, iris[end]);

function softmax_(arr::AbstractArray)
    ex = mapslices(x -> exp.(x), arr, dims=1)
    rows, cols = size(arr)
    val = similar(ex)
    for i in 1:cols
        s = sum(ex[:,i])
        for j in 1:rows
            val[j,i] = ex[j,i] / s
        end
    end
    return val
end

function weights(theta::AbstractVector)
    W0 = reshape(theta[ 1:20], 5, 4); b0 = reshape(theta[21:25], 5)
    W1 = reshape(theta[26:45], 4, 5); b1 = reshape(theta[46:49], 4)
    W2 = reshape(theta[50:61], 3, 4); b2 = reshape(theta[62:64], 3)

    return W0, b0, W1, b1, W2, b2
end

function feedforward(inp::AbstractArray, theta::AbstractVector)
    W0, b0, W1, b1, W2, b2 = weights(theta)
    model = Chain(
        Dense(W0, b0, tanh),
        Dense(W1, b1, tanh),
        Dense(W2, b2, σ),
        softmax_
    )

    return model(inp)
end

alpha = 0.09;
sigma = sqrt(1.0 / alpha);

@model bayesnn(inp, lab) = begin
    theta ~ MvNormal(zeros(64), sigma .* ones(64))  # 64 parameters: 20 + 5 + 20 + 4 + 12 + 3

    preds = feedforward(inp, theta)
    for i = 1:length(lab)
        lab[i] ~ Categorical(preds[:,i])
    end
end

setadbackend(:reverse_diff)
steps = 5000
chain = sample(bayesnn(Array(inputs'), labels), HMC(steps, 0.05, 4));
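
If you do want to eyeball whether the predictions are sensible, something along these lines should do it; predict_class is a hypothetical helper I'm adding here, and how you pull a single theta draw out of chain depends on the Turing version:

# Hypothetical helper: given one draw of theta, return the most probable class
# (1, 2, or 3) for every observation.
function predict_class(inp, theta)
    probs = feedforward(inp, theta)              # 3 × N matrix of class probabilities
    return [argmax(probs[:, i]) for i in 1:size(probs, 2)]
end

# With a single posterior draw theta_draw extracted from chain:
# mean(predict_class(Array(inputs'), theta_draw) .== labels)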

Let me know how it works out! It’s a nifty little project you’re doing here.
