I benchmarked some code that may be useful for others.
Everything was run on my Linux laptop with 8 GB of RAM and no GPU.
Code #1:
"""
    createDataset(path)

Load every image found under `path/<label>/` into one preallocated `28×28×N`
`Float32` array and return it together with a `Vector{UInt8}` holding the label
of each image.

Every entry of `path` is treated as a label directory whose name must parse as
a `UInt8`. When `path` has no entries the result is a `28×28×0` array and an
empty label vector.
"""
function createDataset(path)
    labels = readdir(path)
    # `init = 0` keeps the count defined when `path` has no label directories;
    # reducing an empty collection without it throws.
    N = sum(label -> length(readdir(joinpath(path, label))), labels; init = 0)
    X = zeros(Float32, 28, 28, N)
    y = zeros(UInt8, N)
    i = 1

    for label in labels
        label_val = parse(UInt8, label)
        # `join = true` yields full paths, so no manual path concatenation is needed.
        for file in readdir(joinpath(path, label); join = true)
            img_resized = imresize(load(file), (28, 28))
            # Assumes single-channel images so `channelview` is 28×28 — TODO confirm.
            X[:, :, i] = Float32.(channelview(img_resized))
            y[i] = label_val
            i += 1
        end
    end
    return X, y
end
# Time one full load of the training split, then report completion.
x_train, y_train = @time createDataset("./mnist_png/training/")
println("Loaded 60k images for training...")
# Same measurement for the testing split.
x_test, y_test = @time createDataset("./mnist_png/testing/")
println("Loaded 10k images for testing...")
Output:   4.641136 seconds (4.85 M allocations: 714.702 MiB, 12.10% gc time, 13.50% compilation time)
Loaded 60k images for training...
0.789023 seconds (629.70 k allocations: 109.900 MiB, 2.33% gc time)
Loaded 10k images for testing...
Code #2:
"""
    createDataset(path)

Load every image found under `path/<label>/` and return them as a
`Vector{Matrix{Float32}}` (one resized 28×28 matrix per image) together with a
`Vector{UInt8}` holding the label of each image.

Every entry of `path` is treated as a label directory whose name must parse as
a `UInt8`. When `path` has no entries both returned vectors are empty.
"""
function createDataset(path)
    labels = readdir(path)
    # `init = 0` keeps the count defined when `path` has no label directories;
    # reducing an empty collection without it throws.
    N = sum(label -> length(readdir(joinpath(path, label))), labels; init = 0)
    X = Vector{Matrix{Float32}}()
    y = Vector{UInt8}()
    # Reserve capacity up front so the `push!` calls below do not regrow the vectors.
    sizehint!(X, N)
    sizehint!(y, N)

    for label in labels
        label_val = parse(UInt8, label)
        # `join = true` yields full paths, so no manual path concatenation is needed.
        for file in readdir(joinpath(path, label); join = true)
            img_resized = imresize(load(file), (28, 28))
            push!(X, Float32.(channelview(img_resized)))
            push!(y, label_val)
        end
    end
    return X, y
end
Output:   5.439044 seconds (4.63 M allocations: 531.180 MiB, 13.79% gc time, 18.48% compilation time)
Loaded 60k images for training...
0.648424 seconds (600.21 k allocations: 79.616 MiB, 4.58% gc time)
Loaded 10k images for testing...
Code #3:
"""
    createDataset(path)

Load every image found under `path/<label>/` into a temporary vector of
matrices, then pack them into a single `28×28×N` `Float32` array. Return that
array together with a `Vector{UInt8}` holding the label of each image.

Every entry of `path` is treated as a label directory whose name must parse as
a `UInt8`. When `path` has no entries the result is a `28×28×0` array and an
empty label vector.
"""
function createDataset(path)
    labels = readdir(path)
    # `init = 0` keeps the count defined when `path` has no label directories;
    # reducing an empty collection without it throws.
    N = sum(label -> length(readdir(joinpath(path, label))), labels; init = 0)
    X_vec = Vector{Matrix{Float32}}()
    y = Vector{UInt8}()
    sizehint!(X_vec, N)
    sizehint!(y, N)

    for label in labels
        label_val = parse(UInt8, label)
        # `join = true` yields full paths, so no manual path concatenation is needed.
        for file in readdir(joinpath(path, label); join = true)
            img_resized = imresize(load(file), (28, 28))
            push!(X_vec, Float32.(channelview(img_resized)))
            push!(y, label_val)
        end
    end

    # Second pass: copy each collected matrix into its slice of the dense array.
    X = zeros(Float32, 28, 28, N)
    for (i, img_matrix) in enumerate(X_vec)
        X[:, :, i] = img_matrix
    end

    return X, y
end
Output:   5.375299 seconds (4.63 M allocations: 710.502 MiB, 11.16% gc time, 17.17% compilation time)
Loaded 60k images for training...
0.588595 seconds (600.22 k allocations: 109.526 MiB)
Loaded 10k images for testing...
Code #4:
"""
    createDataset(path)

Multithreaded loader: read every image found under `path/<label>/` into a
`28×28×N` `Float32` array and return it together with a `Vector{UInt8}` holding
the label of each image.

All files are indexed first, so image `k` always lands in slice `k` and the
output order matches the directory listing regardless of the thread count.
Every entry of `path` is treated as a label directory whose name must parse as
a `UInt8`. When `path` has no entries the result is a `28×28×0` array and an
empty label vector.
"""
function createDataset(path)
    # Index every (file, label) pair up front so each image has a fixed output slot.
    files = String[]
    yy = UInt8[]
    for label in readdir(path)
        label_val = parse(UInt8, label)
        for file in readdir(joinpath(path, label); join = true)
            push!(files, file)
            push!(yy, label_val)
        end
    end

    XX = zeros(Float32, 28, 28, length(files))
    # One parallel loop over all files. Each iteration writes only its own slice,
    # so no `threadid()`-indexed buffers, `:static` schedule or merge pass is needed.
    Threads.@threads for k in eachindex(files)
        img_resized = imresize(load(files[k]), (28, 28))
        XX[:, :, k] = Float32.(channelview(img_resized))
    end

    return XX, yy
end
Output: 3.070019 seconds (5.26 M allocations: 741.474 MiB, 13.37% gc time, 26614 lock conflicts, 123.00% compilation time)
Loaded 60k images for training...
0.465059 seconds (600.51 k allocations: 109.710 MiB, 30.19% gc time, 4051 lock conflicts)
Loaded 10k images for testing...