[Code Review] Dirstat ported from C++ and Java

Hi I am writing a simple dirstat program which will traverse a directory and collect various metadata for the file. A task is dispatched to a threadpool when a we encounter a directory. Currently I have implementations in C++ and Java and I these 2 implementations are faster than my Julia program. I am new to Julia so I am wondering if there are any glaring mistakes that I have made.

Note: I am aware that we should have a vector per thread to reduce lock contention but I decided not to impl that yet due to lack of time. As of now I am just comparing these 2 impls with Julia.

Thanks!

Benchmarks (with pagecache)

Julia: 780ms
C++: 337ms
Java: 544ms

Julia Impl

using Base.Threads

const S_IFDIR = 0o040000  # Directory
const S_IFREG = 0o100000  # Regular file
const S_IFLNK = 0o120000  # Symbolic link

struct Filemetadata
    name::String
    size::Int64
    creation_time::Float64
    modification_time::Float64
    access_time::Float64
    is_directory::Bool
    is_file::Bool
    is_symlink::Bool
end

@inline function get_file_metadata(file_path::String)::Filemetadata
    data = lstat(file_path)
    is_directory = (data.mode & S_IFDIR) == S_IFDIR
    is_file = (data.mode & S_IFREG) == S_IFREG
    is_symlink = (data.mode & S_IFLNK) == S_IFLNK

    size = is_directory ? 0 : data.size

    return Filemetadata(file_path,
        size,
        data.ctime,
        data.mtime,
        0.0,
        is_directory,
        is_file,
        is_symlink
    )
end

function read_dir!(directory_path::String, file_metadata::Vector{Filemetadata}, lk::ReentrantLock)
    tasks = Task[]    

    for entry in readdir(directory_path, join=true)
        data = get_file_metadata(entry)

        if data.is_symlink
            continue
        end

        if data.is_directory
            lock(lk) do
                push!(file_metadata, data)
            end
            task = Threads.@spawn read_dir!(joinpath(directory_path, entry), file_metadata, lk)
            push!(tasks, task)
        elseif data.is_file
            lock(lk) do
                push!(file_metadata, data)
            end
        end
    end

    for task in tasks
        wait(task)
    end
end

function walk_storage(file_path::String)::Vector{Filemetadata}
    # Vector to store the file metadata with size 1024
    file_metadata = Vector{Filemetadata}()
    lock = ReentrantLock()
    read_dir!(file_path, file_metadata, lock)

    return file_metadata
end

function main()    
    result = walk_storage("/mnt/sn850x")
    total_sz_b = 0

    for file in result
        total_sz_b += file.size
    end

    total_size_mb = total_sz_b / 1024 / 1024
    total_size_gb = total_sz_b / 1024 / 1024 / 1024

    println("[+] Total size of all files: ", total_sz_b, " B")
    println("[+] Total size of all files: ", total_size_mb, " MB")
    println("[+] Total size of all files: ", total_size_gb, " GB")
    println("[+] Total number of files: ", length(result))end

if abspath(PROGRAM_FILE) == @__FILE__
    main()
end

C++ Impl

#include <iostream>
#include <utility>
#include <sys/stat.h>
#include <filesystem>
#include <mutex>
#include <vector>
#include "mimalloc-new-delete.h" // enable mimalloc new/delete overloads
#include "BS_thread_pool.hpp"

namespace fs = std::filesystem;
namespace chrono = std::chrono;

class file_metadata {
public:
    // Name of the file, including the path
    std::string name;
    // Size of the file in bytes
    long size;
    // Time when the file was created
    chrono::time_point<chrono::system_clock> creation_time;
    // Time when the file was last modified
    chrono::time_point<chrono::system_clock> modification_time;
    // Time when the file was last accessed
    chrono::time_point<chrono::system_clock> access_time;
    // Whether the file is a directory
    bool is_directory;
    // Whether the file is a regular file
    bool is_file;
    // Is the file a symlink
    bool is_symlink;

    file_metadata(std::string name, long size,
                  chrono::time_point<chrono::system_clock> creation_time,
                  chrono::time_point<chrono::system_clock> modification_time,
                  chrono::time_point<chrono::system_clock> access_time,
                  bool is_directory, bool is_file, bool is_symlink)
            : name(std::move(name)), size(size), creation_time(creation_time),
              modification_time(modification_time), access_time(access_time),
              is_directory(is_directory), is_file(is_file), is_symlink(is_symlink) {}

    static auto from_path(const fs::path &path) -> file_metadata {
        // call stat on the path
        struct stat file_stat{};
        lstat(path.c_str(), &file_stat);

        chrono::time_point<std::chrono::system_clock> access_time(
                chrono::seconds(file_stat.st_atim.tv_sec) +
                chrono::nanoseconds(file_stat.st_atim.tv_nsec)
        );

        chrono::time_point<std::chrono::system_clock> modification_time(
                chrono::seconds(file_stat.st_mtim.tv_sec) +
                chrono::nanoseconds(file_stat.st_mtim.tv_nsec)
        );

        chrono::time_point<std::chrono::system_clock> creation_time(
                chrono::seconds(file_stat.st_ctim.tv_sec) +
                chrono::nanoseconds(file_stat.st_ctim.tv_nsec)
        );

        return {
                path.string(),
                S_ISDIR(file_stat.st_mode) ? 0 : file_stat.st_size,
                creation_time,
                modification_time,
                access_time,
                S_ISDIR(file_stat.st_mode),
                S_ISREG(file_stat.st_mode),
                S_ISLNK(file_stat.st_mode)
        };
    }
};

class dirstat {
private:
    std::vector<file_metadata> entries;
    std::mutex entries_mutex;
    BS::thread_pool pool;

    auto read_dir(const fs::path &directory_path) -> void {
        for(const auto &entry_path: fs::directory_iterator(directory_path)) {
            auto file_metadata = file_metadata::from_path(entry_path);

            if (file_metadata.is_directory) {
                std::scoped_lock lock(entries_mutex);
                entries.emplace_back(file_metadata);

                pool.detach_task([this, entry_path] {
                    return read_dir(entry_path);
                });
            } else if (file_metadata.is_file) {
                std::scoped_lock lock(entries_mutex);
                entries.emplace_back(file_metadata);
            }
        }
    }

public:
    auto walk_storage(const fs::path &path) -> std::vector<file_metadata> const& {
        read_dir(path);
        pool.wait();
        return entries;
    }
};

auto main() -> int {
    std::string root = "/mnt/sn850x";

    dirstat ds;
    auto& result = ds.walk_storage(root);

    long total_size = 0;

    for (const auto &entry: result) {
        total_size += entry.size;
    }

    double total_size_mb = (double)total_size / (1024.0 * 1024.0);
    double total_size_gb = (double)total_size / (1024.0 * 1024.0 * 1024.0);

    std::cout << "[+] Total size of all files: " << total_size << " B\n";
    std::cout << "[+] Total size of all files: " << std::format("{}", total_size_mb) << " MB\n";
    std::cout << "[+] Total size of all files: " << std::format("{}", total_size_gb) << " GB\n";
    std::cout << "[+] Number of files: " << result.size() << "\n";

    return 0;
}

Java Impl

package sg.edu.ntu;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class Main {
    record FileMetadata(
            String name,
            long sz,
            FileTime creationTime,
            FileTime modificationTime,
            FileTime lastAccessTime,
            boolean isDirectory,
            boolean isFile
    ) {
        static FileMetadata tryFromPath(Path path) {
            try {
                BasicFileAttributes attributes = Files.readAttributes(path, BasicFileAttributes.class);
                boolean isDirectory = attributes.isDirectory();

                return new FileMetadata(
                        path.toString(),
                        isDirectory ? 0 : attributes.size(),
                        attributes.creationTime(),
                        attributes.lastModifiedTime(),
                        attributes.lastAccessTime(),
                        isDirectory,
                        attributes.isRegularFile()
                );
            } catch (IOException e) {
                System.out.printf("[-] Got IOException: %s\n", e);
            }

            return null;
        }
    }

    public static void main(String[] args) {
        List<FileMetadata> files = walkStorage("/mnt/sn850x");

        long totalSz = files.stream().mapToLong(FileMetadata::sz).sum();
        double totalSzMb = totalSz / (1024.0 * 1024.0);
        double totalSzGb = totalSz / (1024.0 * 1024.0 * 1024.0);

        System.out.printf("[+] Total size of all files: %d B\n", totalSz);
        System.out.printf("[+] Total size of all files: %f MB\n", totalSzMb);
        System.out.printf("[+] Total size of all files: %f GB\n", totalSzGb);
        System.out.printf("[+] Number of files: %d\n", files.size());
    }

    private static List<FileMetadata> walkStorage(String path) {
        try(ExecutorService pool = Executors.newWorkStealingPool()) {
            List<FileMetadata> fileMetadata = Collections.synchronizedList(new ArrayList<>());

            readDir(path, fileMetadata, pool);
            return fileMetadata;
        } catch (Exception e) {
            return List.of();
        }
    }

    private static void readDir(String path, List<FileMetadata> fileMetadata, ExecutorService pool) {
        File baseFile = new File(path);
        File[] files = baseFile.listFiles(file -> !Files.isSymbolicLink(file.toPath()));

        if(files != null) {
            for(File file: files) {
                if(file.isDirectory()) {
                    fileMetadata.add(FileMetadata.tryFromPath(file.toPath()));
                    pool.submit(() -> readDir(file.getAbsolutePath(), fileMetadata, pool));
                } else {
                    fileMetadata.add(FileMetadata.tryFromPath(file.toPath()));
                }
            }
        }
    }
}
1 Like

Since this is tagged “Code Review” a comment on Julia convention:
Usually we put the argument(s) that get(s) mutated first. So we’d prefer read_dir!(file_metadata, directory_path, lock). Also type annotations are generally only used for dispatch, so in this case you don’t need to put them - there is absolutely no performance gain.

Now for the code. I dislike the lock-based approach and would forgo that. Here is a version of your code that I would consider more idiomatic. I don’t know about performance but it could end up faster as well as there is no synchronization overhead anymore.

function read_dir(directory_path)
    tasks = Task[]
    files = Filemetadata[]

    for entry in readdir(directory_path, join=true)
        data = get_file_metadata(entry)

        data.is_symlink && continue # skip symlinks
        (data.is_directory || data.is_file) && push!(file_metadata, data) # save files and directories
        data.is_directory && push!(tasks, Threads.@spawn(read_dir(entry))) # recurse directories
        # are you sure joinpath(directory_path, entry) is correct in your code if you have join=true in readdir?
        # if so change entry back to joinpath(directory_path, entry) in the call above
    end
    # merge results with the results from the subtasks
    return vcat(files, mapreduce(fetch, vcat, tasks))
end

Hello! Thanks for feedback on the style that is followed in Julia. Also you are right that since I have join=true, I didn’t need joinpath.
Performance wise, unfortunately this version of read_dir it runs 2x slower at 1.8s on my machine :frowning: I also did a version where I pushed to a local vector before pushing them to the final vector and that saved around 100ms

How are you benchmarking these exactly? Java and Julia have some runtime overheads that a C++ executable won’t.

You’re forgoing NRVO here.
If you allocated entries locally and returned it, the allocation would be in the destination and this wouldn’t require a move or a copy.
If you std::moved it in the end, it wouldn’t require a copy, but would require a move.
As is, it needs a copy. I doubt that makes much difference.

1 Like

I am currently using hyperfine to benchmark them. I understand that there are runtime overhead that C++ executable won’t have and I don’t expect them to work as fast as the C++ impl.

This is how I run the benchmarks

Java

hyperfine 'java -jar ./impl/java/out/artifacts/java_jar/java.jar' --warmup 8
Benchmark 1: java -jar ./impl/java/out/artifacts/java_jar/java.jar
  Time (mean ± σ):     569.9 ms ±  12.4 ms    [User: 2834.7 ms, System: 4483.7 ms]
  Range (min … max):   553.7 ms … 591.7 ms    10 runs

Julia

hyperfine 'julia ./impl/julia/multi_threaded.jl' --warmup 8
Benchmark 1: julia ./impl/julia/multi_threaded.jl
  Time (mean ± σ):     803.3 ms ±  36.3 ms    [User: 827.9 ms, System: 2453.8 ms]
  Range (min … max):   733.6 ms … 864.2 ms    10 runs

C++

hyperfine cpp --warmup 8
Benchmark 1: cpp
  Time (mean ± σ):     336.1 ms ±   6.2 ms    [User: 866.3 ms, System: 2719.6 ms]
  Range (min … max):   328.4 ms … 347.1 ms    10 runs

Note that restarting julia over and over again, as hyperfine does, means that your code will be compiled over and over again. Consider using BenchmarkTools.jl to benchmark the julia code in a single session & eliminate the compilation overhead.

2 Likes

As for the remaining overhead compared to C++/java, I think this will be the largest part. You’re using a threadpool both in C++ and java, which matches the number of worker-threads (in C++/java speak) to the number of hardware threads of your machine, and scheduling individual pieces of work one after another (without starting them!). In contrast, in Julia you’re creating & immediately scheduling N tasks for work, unnecessarily increasing contention on your lock.

It’s just a guess, but I wonder if you can gain a speedup by reformulating your code to take advantage of Threads.@threads, which should have better performance characteristics & better mimic the behavior of the threadpools in the other languages. Might also be worth to use one of the 1.11 prereleases to take advantage of :greedy scheduling, if the workload is non-uniform.

Have you set -t ... or JULIA_NUM_THREADS for julia through some mechanism? Otherwise, you might just be running with a single thread. What does Threads.nthreads() say when you start julia?

Thanks for this! Now the numbers looks much better! Now it’s much closer to the C++ impl

BenchmarkTools.Trial: 14 samples with 1 evaluation.
 Range (min … max):  301.356 ms … 500.675 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     381.827 ms               ┊ GC (median):    5.84%
 Time  (mean ± σ):   373.544 ms ±  51.094 ms  ┊ GC (mean ± σ):  7.88% ± 8.11%

                              █
  ▆▁▁▆▁▁▆▁▆▁▁▁▁▆▁▁▁▁▁▆▁▁▆▁▁▁▆▁█▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆ ▁
  301 ms           Histogram: frequency by time          501 ms <

 Memory estimate: 377.67 MiB, allocs estimate: 3274785.

Then I guess another question that I have is, is there anyway for me to store a compiled julia script and run it instead of running my previous method?

1 Like

Yup I have set JULIA_NUM_THREADS=8 and running Threads.nthreads() is 8

1 Like

While there are some solutions (to various degree of “solution”) available today, I think what you’re looking for will mostly be covered by the upcoming juliac:

which will be presented in just shy of 2 weeks from today. I’m just a user of the language, but the community at large has been waiting for this for a very long time, so we’re all very excited about it :slight_smile:

7 Likes

Thanks for this info, I guess I really picked the perfect time to use Julia :laughing:

2 Likes

Sorry I don’t quite get what you mean here. From what I understand, I should allocate the vector in walk_storage and pass in a reference to read_dir?
Something like this

class dirstat {
private:
    std::mutex entries_mutex;
    BS::thread_pool pool;

    auto read_dir(const fs::path &directory_path, std::vector<file_metadata>& entries) -> void {
        for(const auto &entry_path: fs::directory_iterator(directory_path)) {
            auto file_metadata = file_metadata::from_path(entry_path);

            if (file_metadata.is_directory) {
                std::scoped_lock lock(entries_mutex);
                entries.emplace_back(file_metadata);

                pool.detach_task([this, entry_path, &entries] {
                    return read_dir(entry_path, entries);
                });
            } else if (file_metadata.is_file) {
                std::scoped_lock lock(entries_mutex);
                entries.emplace_back(file_metadata);
            }
        }
    }

public:
    auto walk_storage(const fs::path &path) -> std::vector<file_metadata> {
        std::vector<file_metadata> entries;
        read_dir(path, entries);
        pool.wait();
        return entries;
    }
};

The code above runs slight faster by 10-20ms

Yes, exactly that. You can see the first bullet under the section Non-mandatory copy/move(since C++11) elision for some more information, but it’s a bit dense.
Basically, it’s saying code like your new version allows avoiding copy/moves. Because entries

  1. isn’t volatile and has “automatic” (i.e. normal, not-static) life time (so that it’d have been destroyed upon returning)
  2. has the same type as the function’s return type.
  3. is returned by name
1 Like

Depending on your exact use, DaemonMode.jl might be helpful.

3 Likes

You wrap your code in a Package and then use PrecompileTools.jl to cache your code.

But if you use @time on your first call you check how much of an overhead compilation is.

Julia 1.11 will bring down startup time for scripts a bit.

I tried this but there isn’t any difference in terms of performance. I am not sure if I did it correctly tho.

File Tree

├── Manifest.toml
├── MultiThreaded
│   ├── Manifest.toml
│   ├── Project.toml
│   └── src
│       └── MultiThreaded.jl
├── Project.toml
└── runner.jl

MultiThreaded/src/MultiThreaded.jl

module MultiThreaded

using Base.Threads
using PrecompileTools: @compile_workload, @setup_workload

const S_IFDIR = 0o040000  # Directory
const S_IFREG = 0o100000  # Regular file
const S_IFLNK = 0o120000  # Symbolic link

struct Filemetadata
    name::String
    size::Int64
    creation_time::Float64
    modification_time::Float64
    access_time::Float64
    is_directory::Bool
    is_file::Bool
    is_symlink::Bool
end

@inline function get_file_metadata(file_path::String)::Filemetadata
    data = lstat(file_path)
    is_directory = (data.mode & S_IFDIR) == S_IFDIR
    is_file = (data.mode & S_IFREG) == S_IFREG
    is_symlink = (data.mode & S_IFLNK) == S_IFLNK

    size = is_directory ? 0 : data.size

    return Filemetadata(file_path,
        size,
        data.ctime,
        data.mtime,
        0.0,
        is_directory,
        is_file,
        is_symlink
    )
end

function read_dir!(directory_path::String, file_metadata::Vector{Filemetadata}, lk::ReentrantLock)
    tasks = Task[]    

    for entry in readdir(directory_path, join=true)
        data = get_file_metadata(entry)

        if data.is_symlink
            continue
        end

        if data.is_directory
            lock(lk) do
                push!(file_metadata, data)
            end
            task = Threads.@spawn read_dir!(entry, file_metadata, lk)
            push!(tasks, task)
        elseif data.is_file
            lock(lk) do
                push!(file_metadata, data)
            end
        end
    end

    for task in tasks
        wait(task)
    end
end

function walk_storage(file_path::String)::Vector{Filemetadata}
    # Vector to store the file metadata with size 1024
    file_metadata = Vector{Filemetadata}()
    lock = ReentrantLock()
    read_dir!(file_path, file_metadata, lock)

    return file_metadata
end

function dirstat()    
    result = walk_storage("/mnt/sn850x")
    total_sz_b = 0

    for file in result
        total_sz_b += file.size
    end

    total_size_mb = total_sz_b / 1024 / 1024
    total_size_gb = total_sz_b / 1024 / 1024 / 1024

    println("[+] Total size of all files: ", total_sz_b, " B")
    println("[+] Total size of all files: ", total_size_mb, " MB")
    println("[+] Total size of all files: ", total_size_gb, " GB")
    println("[+] Total number of files: ", length(result))
end

@compile_workload begin
    dirstat()
end

end 

runner.jl

using MultiThreaded

MultiThreaded.dirstat()

Running @time shows that compilation takes about 5% of the time which means that it’s not even close to the 380ms that was previously reported using BenchmarkTools.

I ran runner.jl using the following

julia --project=/mnt/sn850x/project/impl/julia ./runner.jl

I also tried using Sys Image with PackageCompiler but it resulted with the same performance (.so file was 180mb)

Do you need to set -tauto or did you set the ENV variable JULIA_NUM_THREADS=auto?

Yup I set JULIA_NUM_THREADS=8 before running runner.jl. I have 8 core CPU so I chose 8