Okay, I think I managed to get it working! First I load in the data and one-hot-encode it (pretty similar to my first shot above):
using MLJ
using Flux
import MLJFlux
import MLJIteration
using CairoMakie
using Random   # for shuffle
import Base.@kwdef
using Downloads
url_seqs = "https://raw.githubusercontent.com/abidlabs/deep-learning-genomics-primer/master/sequences.txt"
sequences_raw = split(String(take!(Downloads.download(url_seqs, IOBuffer()))))
url_labels = "https://raw.githubusercontent.com/abidlabs/deep-learning-genomics-primer/master/labels.txt"
labels_raw = parse.(Int64, split(String(take!(Downloads.download(url_labels, IOBuffer())))))
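As a quick sanity check, the download gives 2000 sequences of 50 bases each (sizes implied by L = 50 and the 1500/500 train/test split below):
length(sequences_raw)          # 2000
length(first(sequences_raw))   # 50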
#%%
bases_dna = ['A', 'C', 'G', 'T']
# Compare each base in a sequence against the four possibilities,
# giving an L × 4 Bool matrix:
function ohe_row(sequence)
    return collect(sequence) .== permutedims(bases_dna)
end
# Stack the per-sequence encodings into an L × 4 × N array:
function onehot(sequences)
    L = 50
    N_bases = 4
    out = BitArray(undef, L, N_bases, length(sequences))
    for (i, sequence) in enumerate(sequences)
        out[:, :, i] = ohe_row(sequence)
    end
    return out
end
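To illustrate the encoding, here is what ohe_row returns for a toy four-base sequence (a made-up example, not from the data):
ohe_row("ACGT")
# 4×4 BitMatrix:
#  1  0  0  0
#  0  1  0  0
#  0  0  1  0
#  0  0  0  1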
#%%
sequences = onehot(sequences_raw);
images = coerce(sequences, GrayImage);
labels = coerce(labels_raw, Binary);
N_images = length(images)
idxs = shuffle(1:N_images);
train = idxs[1:1500];
test = idxs[1501:end];
One slightly weird feature is that it seems easiest to convert the genomic data to an L × 4 gray image (with L being the sequence length, 50 in this case).
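For the record, here is roughly what the coercion produces (a sketch; the exact types may vary with the ScientificTypes version):
size(sequences)   # (50, 4, 2000): one L × 4 slice per sequence
scitype(images)   # AbstractVector{GrayImage{50, 4}}
images[1]         # a 50×4 matrix of Gray values, i.e. one "image"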
The convolutional network was a bit tricky to get working, in particular getting the dimensions of the convolutional filter right. However, I think I got it with the following:
@kwdef struct MyDNABuilder
    kernel_size::Int
    filters::Int
    dense::Int
end
# Flatten everything except the batch dimension:
make2d(x::AbstractArray) = reshape(x, :, size(x)[end])
function MLJFlux.build(b::MyDNABuilder, rng, n_in, n_out, n_channels)
    kernel_size, filters, dense = b.kernel_size, b.filters, b.dense
    init = Flux.glorot_uniform(rng)
    front = Chain(
        # The (kernel_size, 4) filter spans all four bases, so the second
        # dimension collapses to 1 after the convolution:
        Conv((kernel_size, 4), 1 => filters, pad = (0, 0), bias = true, init = init),
        MaxPool((4, 1)),
        make2d,
    )
    # Infer the flattened feature count with a dummy forward pass:
    d = Flux.outputsize(front, (n_in..., n_channels, 1)) |> first
    return Chain(front, Dense(d, dense, init = init), Dense(dense, n_out, init = init))
end
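To double-check the filter dimensions, the shapes can be traced through the front of the network (a throwaway check using the same hyperparameters as below):
front = Chain(Conv((12, 4), 1 => 32), MaxPool((4, 1)), make2d)
Flux.outputsize(front, (50, 4, 1, 1))
# (288, 1): Conv gives 39 × 1 × 32, MaxPool gives 9 × 1 × 32, flattened to 288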
clf = MLJFlux.ImageClassifier(
    builder = MyDNABuilder(kernel_size = 12, filters = 32, dense = 16),
    loss = Flux.crossentropy,
    optimiser = ADAM(0.001, (0.9, 0.999)),
    epochs = 50,
    batch_size = 32,
    rng = 123,
    finaliser = softmax,
)
This network specification should be similar to the one used in Python (see my original post).
To capture the training losses as a function of epoch, I followed the MNIST example from MLJFlux:
# To store the traces:
losses = Float32[]
training_losses = Float32[]
epochs = Int64[]
# To update the traces:
update_loss(loss) = push!(losses, loss)
update_training_loss(losses) = push!(training_losses, losses[end])
update_epochs(epoch) = push!(epochs, epoch)
controls = [
    Step(1),                                     # train one epoch per control cycle
    NumberLimit(n = 50),                         # stop after 50 epochs
    WithLossDo(),                                # log the out-of-sample loss
    WithLossDo(update_loss),                     # record the out-of-sample loss
    WithTrainingLossesDo(update_training_loss),  # record the training loss
    WithIterationsDo(update_epochs),             # record the epoch number
];
# The "self-iterating" classifier:
iterated_clf = IteratedModel(
model = clf,
controls = controls,
resampling = Holdout(fraction_train = 0.75),
measure = log_loss,
);
# Binding the wrapped model to data:
mach = machine(iterated_clf, images[train], labels[train]);
# Training
fit!(mach);
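The traces can then be plotted with CairoMakie along these lines (a minimal sketch of my plotting code; the styling details are incidental):
fig = Figure()
ax = Axis(fig[1, 1], xlabel = "epoch", ylabel = "cross-entropy loss")
lines!(ax, epochs, losses, label = "out-of-sample")
lines!(ax, epochs, training_losses, label = "training")
axislegend(ax)
fig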
This gives the following training loss curve:
I do not understand why the loss decreases, then increases around epochs 25-30, and then suddenly drops again. If anyone has any suggestions, please let me know! In comparison, the training loss in the Python example does not exhibit similar behaviour.
The accuracy on the test set is 96.8% with Julia versus 97.4% with Python, so the Python version also has a slight edge there.
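For reference, these numbers come from something like the following (a sketch; predict_mode gives point predictions, and accuracy and confusion_matrix are MLJ measures):
yhat = predict_mode(mach, images[test])
accuracy(yhat, labels[test])           # ≈ 0.968
confusion_matrix(yhat, labels[test])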
The confusion matrix from Julia is:
              ┌─────────────────────────────┐
              │        Ground Truth         │
┌─────────────┼──────────────┬──────────────┤
│  Predicted  │      0       │      1       │
├─────────────┼──────────────┼──────────────┤
│      0      │     241      │      3       │
├─────────────┼──────────────┼──────────────┤
│      1      │      13      │     243      │
└─────────────┴──────────────┴──────────────┘
and in Python is:
[[249  10]
 [  3 238]]
(Note that the row/column conventions differ: MLJ's printout puts predictions on the rows, whereas scikit-learn's confusion_matrix, which the Python example appears to use, puts the ground truth on the rows, so the two matrices are not directly comparable element by element.)