Loading 60k images from a folder. Python code is way faster than Julia

This is the code I have currently:

using Images
# Walk `path/<label>/<file>`, load every image and return `(X, y)`:
# the image data stacked along the third dimension and one label per image,
# parsed from the name of the folder the image was found in.
function createDataset(path)
    # Both containers start as untyped `[]`; X is later rebound to an array.
    X = []
    y = []
    for label in readdir(path)
        for file in readdir("$path/$label")
            img = load("$path/$label/$file")
            # Convert the pixels to Float32 and give the image a singleton third axis.
            data = reshape(Float32.(channelview(img)),28,28,1)
            if length(X) == 0
                # First image: it becomes the initial stack.
                X = data
            else
                # Every later image builds a brand-new stack from the old stack
                # plus this image, so the accumulated data is copied once per file.
                X = cat(X,data,dims=3)
            end
            push!(y,parse(Float32,label))
        end
    end
    return X,y
end

The same code logic in python:

import os
import time
import numpy as np
from PIL import Image

def create_dataset(path):
    """Load every image below ``path/<label>/`` into NumPy arrays.

    Each sub-directory of ``path`` is treated as one class whose name is the
    numeric label; entries of ``path`` that are not directories are skipped.
    Every file inside a class directory is opened, converted to 8-bit
    greyscale (mode ``'L'``) and reshaped to ``(28, 28, 1)`` float32.

    Returns:
        A tuple ``(X, y)`` where ``X`` stacks the images along a new leading
        axis and ``y`` holds the label of each image as a float, in the same
        order.
    """
    images, labels = [], []

    for class_name in os.listdir(path):
        class_dir = os.path.join(path, class_name)
        if os.path.isdir(class_dir):
            for entry in os.listdir(class_dir):
                grey = Image.open(os.path.join(class_dir, entry)).convert('L')
                images.append(np.array(grey, dtype=np.float32).reshape(28, 28, 1))
                labels.append(float(class_name))

    return np.array(images), np.array(labels)

# Build the training and testing arrays from the two folders of the PNG dataset.
x_train, y_train = create_dataset("mnist_png/training")
x_test, y_test = create_dataset("mnist_png/testing")

Python took 13s but Julia is taking more than 10mins and still running.

What am I doing wrong?

how large are the images? Are you sure you’re not dipping into Swap?

but also, this is copying the whole X every time you append some new data, maybe this is the problem?

Also, is it possible to benchmark on just a few files? I wonder if the CPU or storage is being throttled after a long-running session.

The images are from the MNIST dataset and are already 28×28
(MNIST png | Kaggle)

I just wanted to reshape again the same image and see how long does it take.

Regarding cat, I suspected the same, but I couldn’t quite interpret it. If that’s the problem, any idea how to do this differently? Use a vector and push! to it?

You can determine the size of X a priori and preallocate it. Then set the elements in the loop instead of growing the array.

Your python code is appending, while your Julia code is allocating a new array for each iteration, giving the algorithm N^2 complexity.

Use append or push in Julia, too, or better yet, pre-allocate.

Julia code should not have untyped containers. The above line is a guarantee for slow code.

5 Likes

Sorry, new to Julia.

Should i do it like this?

# NOTE(review): this asks for a one-dimensional container of Float64 values,
# not storage for 60k 28x28 images — see the replies that follow.
X = vector{Float64}(60000)

Does this preallocate space for 60k images, as the other comment suggested?
Or should I use the sizehint! function?

Can you try

"""
    createDataset(path)

Load every image stored as `path/<label>/<file>` into preallocated arrays.

The number of images is counted first so that the output can be allocated
once; each image is then written directly into its own slice instead of
growing an array inside the loop.

# Arguments
- `path`: directory whose entries are label folders. Each folder name must
  parse as a `Float32`, and each file inside must load as a 28×28 image.

# Returns
- `X`: `28×28×N` `Float32` array, one slice per image.
- `y`: `Float32` vector of length `N` with the label of each slice.

An empty `path` yields `X` of size `28×28×0` and an empty `y`.
"""
function createDataset(path)
    labels = readdir(path)
    # `init=0` keeps the count well-defined when there are no label folders.
    N = sum(label -> length(readdir(joinpath(path, label))), labels; init=0)
    X = zeros(Float32, 28, 28, N)
    y = zeros(Float32, N)
    i = 1
    for label in labels
        for file in readdir(joinpath(path, label))
            img = load(joinpath(path, label, file))
            # Broadcast-assign into the i-th slice of X (the preallocated
            # output); the element conversion to Float32 happens in place.
            X[:, :, i] .= channelview(img)
            y[i] = parse(Float32, label)
            i += 1
        end
    end
    return X, y
end

How many images do you have?
It is very slow indeed. Is it load that is so slow?
I can cheat to get down to 1s (on my computer) for 60k images:

"""
    createDataset(path)

Load every image stored as `path/<label>/<file>` using multiple threads.

All `(label, file)` pairs are listed up front, so every image owns a fixed
slot in the output. Threads then fill disjoint slices of the preallocated
arrays, which means no per-thread buffers, no dependence on
`Threads.threadid()`, and an output order that is the same regardless of how
many threads Julia was started with.

# Arguments
- `path`: directory whose entries are label folders. Each folder name must
  parse as a `Float32`, and each file inside must load as a 28×28 image.

# Returns
- `XX`: `28×28×N` `Float32` array, one slice per image, in directory order.
- `yy`: `Float32` vector of length `N` with the label of each slice.
"""
function createDataset(path)
    # Flatten the directory tree into one list so slot n always means entry n.
    entries = Tuple{String,String}[]
    for label in readdir(path), file in readdir(joinpath(path, label))
        push!(entries, (label, joinpath(path, label, file)))
    end
    N = length(entries)
    @info "$N Images found"

    XX = zeros(Float32, 28, 28, N)
    yy = zeros(Float32, N)
    # Each iteration touches only slice n / element n, so no locking is needed.
    Threads.@threads for n in eachindex(entries)
        label, file = entries[n]
        XX[:, :, n] .= channelview(load(file))
        yy[n] = parse(Float32, label)
    end
    return XX, yy
end

2 Likes

This only allocates a vector of floats. You need either a vector of matrices or a 3D array; it’s up to you which is better for your purpose.

If you can manage with a vector of matrices, I would just do

# One slot per image, each meant to hold a matrix; `undef` leaves the slots unset.
X = Vector{Matrix{Float64}}(undef, N) 

since each load allocates a matrix, you might as well use that

# Store the matrix produced by `load` in slot i.
X[i] = load(filename) 

If you must have a 3d array, you have to fully allocate, though

# Allocate the whole 28×28×N block at once; `undef` means its contents are not initialised.
X = Array{Float64, 3}(undef, 28,28,N)

Your solution worked like a charm!

I made a few adjustments to the code.

"""
    createDataset(path)

Read every image stored as `path/<label>/<file>` into preallocated arrays.

Returns `(pixels, targets)`: a `28×28×N` `Float32` array with one slice per
image and a `UInt8` vector holding the label (parsed from the folder name)
of each slice.
"""
function createDataset(path)
    classes = readdir(path)
    total = sum(length(readdir("$path/$c")) for c in classes)
    pixels = zeros(Float32, 28, 28, total)
    targets = zeros(UInt8, total)

    slot = 0
    for class in classes
        # The label is the same for every file in the folder, so parse it once.
        digit = parse(UInt8, class)
        for fname in readdir("$path/$class")
            slot += 1
            pixels[:, :, slot] = Float32.(channelview(load("$path/$class/$fname")))
            targets[slot] = digit
        end
    end
    return pixels, targets
end

This took me 4s for 60k images

Benchmarked some code, which may be useful for others:
Running on my Linux laptop with 8GB of RAM and no GPU.

Code #1:

"""
    createDataset(path)

Benchmark variant 1: count the images, preallocate a 3-D array, fill it.

Every file under `path/<label>/` is loaded, passed through
`imresize(_, (28, 28))`, converted to `Float32` and copied into its slice.
Returns `(images, labels)` — a `28×28×N` `Float32` array and a `UInt8`
vector of labels parsed from the folder names.
"""
function createDataset(path)
    nimages = sum(length(readdir("$path/$d")) for d in readdir(path))
    images = zeros(Float32, 28, 28, nimages)
    labels = zeros(UInt8, nimages)

    k = 1
    for d in readdir(path)
        value = parse(UInt8, d)
        folder = "$path/$d"
        for name in readdir(folder)
            scaled = imresize(load("$folder/$name"), (28, 28))
            images[:, :, k] = Float32.(channelview(scaled))
            labels[k] = value
            k += 1
        end
    end
    return images, labels
end

# Time the load of each split; the printed messages state the expected image counts.
@time x_train, y_train = createDataset("./mnist_png/training/")
println("Loaded 60k images for training...")
@time x_test, y_test = createDataset("./mnist_png/testing/")
println("Loaded 10k images for testing...")

Output: 4.641136 seconds (4.85 M allocations: 714.702 MiB, 12.10% gc time, 13.50% compilation time)
Loaded 60k images for training…
0.789023 seconds (629.70 k allocations: 109.900 MiB, 2.33% gc time)
Loaded 10k images for testing…

Code #2:

"""
    createDataset(path)

Benchmark variant 2: collect the images in a typed vector of matrices.

The image count is used only as a capacity hint for the two vectors; each
loaded image is resized with `imresize(_, (28, 28))`, converted to `Float32`
and pushed. Returns `(images, labels)` as a `Vector{Matrix{Float32}}` and a
`Vector{UInt8}` of labels parsed from the folder names.
"""
function createDataset(path)
    total = sum(length(readdir("$path/$label")) for label in readdir(path))
    # `sizehint!` returns the collection it was given, so reserve and bind in one step.
    images = sizehint!(Matrix{Float32}[], total)
    labels = sizehint!(UInt8[], total)

    for label in readdir(path)
        digit = parse(UInt8, label)
        for file in readdir("$path/$label")
            resized = imresize(load("$path/$label/$file"), (28, 28))
            push!(images, Float32.(channelview(resized)))
            push!(labels, digit)
        end
    end
    return images, labels
end

Output: 5.439044 seconds (4.63 M allocations: 531.180 MiB, 13.79% gc time, 18.48% compilation time)
Loaded 60k images for training…
0.648424 seconds (600.21 k allocations: 79.616 MiB, 4.58% gc time)
Loaded 10k images for testing…

Code #3:

"""
    createDataset(path)

Benchmark variant 3: stage the images in a vector, then copy into a 3-D array.

Images are loaded, resized with `imresize(_, (28, 28))`, converted to
`Float32` and pushed onto a staging vector; afterwards each staged matrix is
copied into its slice of a `28×28×N` array. Returns `(X, y)` where `y` is a
`Vector{UInt8}` of labels parsed from the folder names.
"""
function createDataset(path)
    N = sum(length(readdir("$path/$label")) for label in readdir(path))
    staged = Matrix{Float32}[]
    y = UInt8[]
    sizehint!(staged, N)
    sizehint!(y, N)

    for label in readdir(path)
        digit = parse(UInt8, label)
        for file in readdir("$path/$label")
            resized = imresize(load("$path/$label/$file"), (28, 28))
            push!(staged, Float32.(channelview(resized)))
            push!(y, digit)
        end
    end

    # Second pass: move the staged matrices into one contiguous array.
    X = zeros(Float32, 28, 28, N)
    for (i, m) in enumerate(staged)
        X[:, :, i] = m
    end

    return X, y
end

Output: 5.375299 seconds (4.63 M allocations: 710.502 MiB, 11.16% gc time, 17.17% compilation time)
Loaded 60k images for training…
0.588595 seconds (600.22 k allocations: 109.526 MiB)
Loaded 10k images for testing…

Code #4:

"""
    createDataset(path)

Benchmark variant 4: load the files of each label folder on several threads.

Each thread pushes the images it loads (resized with `imresize(_, (28, 28))`
and converted to `Float32`) onto its own buffer, selected by
`Threads.threadid()`. The buffers are concatenated into the output
afterwards, so the order of the images depends on how the work was split
between threads; `XX[:, :, n]` and `yy[n]` always belong together.

# Returns
- `XX`: `28×28×N` `Float32` array, one slice per image.
- `yy`: `UInt8` vector of length `N` with the label of each slice.
"""
function createDataset(path)
    N = sum(length(readdir("$path/$label")) for label in readdir(path))
    # Size the buffers by `maxthreadid()`, not `nthreads()`: thread ids are
    # numbered across all thread pools, so a worker's `threadid()` can be larger
    # than `nthreads()` (e.g. when Julia runs with an interactive thread) and
    # would otherwise index past the end of the buffer list.
    nbuffers = Threads.maxthreadid()
    X = [Vector{Matrix{Float32}}() for _ in 1:nbuffers]
    y = [Vector{UInt8}() for _ in 1:nbuffers]

    for label in readdir(path)
        label_val = parse(UInt8, label)
        # `:static` keeps an iteration on one thread, so `threadid()` cannot
        # change between reading it and pushing to the buffers.
        Threads.@threads :static for file in readdir("$path/$label")
            img = load("$path/$label/$file")
            img_resized = imresize(img, (28, 28))
            img_matrix = Float32.(channelview(img_resized))
            i = Threads.threadid()
            push!(X[i], img_matrix)
            push!(y[i], label_val)
        end
    end

    # Concatenate the per-thread buffers into the preallocated outputs.
    XX = zeros(Float32, 28, 28, N)
    yy = zeros(UInt8, N)
    n = 1
    for (Xi, yi) in zip(X, y), j in eachindex(Xi)
        XX[:, :, n] = Xi[j]
        yy[n] = yi[j]
        n += 1
    end

    return XX, yy
end

Output: 3.070019 seconds (5.26 M allocations: 741.474 MiB, 13.37% gc time, 26614 lock conflicts, 123.00% compilation time)
Loaded 60k images for training…
0.465059 seconds (600.51 k allocations: 109.710 MiB, 30.19% gc time, 4051 lock conflicts)
Loaded 10k images for testing…

1 Like

I think you didn’t start Julia with julia -t auto to use all threads. Times are similar because all you did is type stable and dominated by loading. Nice benchmark though

1 Like

Thanks for the info. I edited the previous reply with julia -t auto