Loading 60k images from a folder. Python code is way faster than Julia

Can you try this?

"""
    createDataset(path)

Load every image found under `path/<label>/<file>` into one array.

Each immediate subdirectory of `path` is treated as a class; its name is parsed
as a `Float32` and used as the label of every file inside it.

# Returns
- `X`: `28×28×N` `Float32` array, one slice per image.
- `y`: length-`N` `Float32` vector with `y[i]` the label of `X[:, :, i]`.

`N` is the total number of files found. Samples are stored in `readdir` order,
label by label.

# Throws
- `ArgumentError` from `parse` if a subdirectory name is not a number.
- `DimensionMismatch` if an image does not broadcast into a `28×28` slice.
"""
function createDataset(path)
    # List the tree once so the count and the iteration see the same files.
    labels = readdir(path)
    files = [readdir(joinpath(path, label)) for label in labels]
    # `init = 0` keeps an empty `path` from throwing; it yields empty outputs.
    N = mapreduce(length, +, files; init = 0)
    X = zeros(Float32, 28, 28, N)
    y = zeros(Float32, N)
    i = 1
    for (label, names) in zip(labels, files)
        value = parse(Float32, label)  # same for every file in this directory
        for file in names
            # `load`/`channelview` are not defined here — presumably provided
            # by the image package the caller has loaded (TODO confirm).
            img = load(joinpath(path, label, file))
            # Write into the preallocated slice; the values are converted to Float32.
            @views X[:, :, i] .= channelview(img)
            y[i] = value
            i += 1
        end
    end
    return X, y
end

How many images do you have?
It is very slow indeed. Is it `load` that's so slow?
I can cheat to get down to 1 s (on my computer) for 60K images:

"""
    createDataset(path)

Load every image found under `path/<label>/<file>` into one array, using all
available threads.

Each immediate subdirectory of `path` is treated as a class; its name is parsed
as a `Float32` and used as the label of every file inside it.

# Returns
- `XX`: `28×28×N` `Float32` array, one slice per image.
- `yy`: length-`N` `Float32` vector with `yy[i]` the label of `XX[:, :, i]`.

`N` is the total number of files found. Samples are stored in `readdir` order,
label by label, regardless of how many threads are used.

# Throws
- `ArgumentError` from `parse` if a subdirectory name is not a number.
- `DimensionMismatch` if an image does not broadcast into a `28×28` slice.
"""
function createDataset(path)
    # Flatten the tree into (file path, label) pairs up front, so each loop
    # iteration knows exactly which output slot it owns.
    entries = Tuple{String,Float32}[
        (joinpath(path, label, file), parse(Float32, label))
        for label in readdir(path) for file in readdir(joinpath(path, label))
    ]
    N = length(entries)
    @info "$N Images found"
    XX = zeros(Float32, 28, 28, N)
    yy = zeros(Float32, N)
    # Every iteration writes to a distinct index `n`, so no locking is needed and
    # nothing is indexed by `threadid()` (which may exceed `nthreads()`).
    Threads.@threads for n in eachindex(entries)
        file, value = entries[n]
        # `load`/`channelview` are not defined here — presumably provided by
        # the image package the caller has loaded (TODO confirm).
        img = load(file)
        @views XX[:, :, n] .= channelview(img)
        yy[n] = value
    end
    return XX, yy
end

2 Likes