Hi I am writing a simple dirstat
program which will traverse a directory and collect various metadata for the file. A task is dispatched to a threadpool when a we encounter a directory. Currently I have implementations in C++ and Java and I these 2 implementations are faster than my Julia program. I am new to Julia so I am wondering if there are any glaring mistakes that I have made.
Note: I am aware that we should have a vector per thread to reduce lock contention but I decided not to impl that yet due to lack of time. As of now I am just comparing these 2 impls with Julia.
Thanks!
Benchmarks (with pagecache)
Julia: 780ms
C++: 337ms
Java: 544ms
Julia Impl
using Base.Threads
const S_IFDIR = 0o040000 # Directory
const S_IFREG = 0o100000 # Regular file
const S_IFLNK = 0o120000 # Symbolic link
struct Filemetadata
name::String
size::Int64
creation_time::Float64
modification_time::Float64
access_time::Float64
is_directory::Bool
is_file::Bool
is_symlink::Bool
end
@inline function get_file_metadata(file_path::String)::Filemetadata
data = lstat(file_path)
is_directory = (data.mode & S_IFDIR) == S_IFDIR
is_file = (data.mode & S_IFREG) == S_IFREG
is_symlink = (data.mode & S_IFLNK) == S_IFLNK
size = is_directory ? 0 : data.size
return Filemetadata(file_path,
size,
data.ctime,
data.mtime,
0.0,
is_directory,
is_file,
is_symlink
)
end
function read_dir!(directory_path::String, file_metadata::Vector{Filemetadata}, lk::ReentrantLock)
tasks = Task[]
for entry in readdir(directory_path, join=true)
data = get_file_metadata(entry)
if data.is_symlink
continue
end
if data.is_directory
lock(lk) do
push!(file_metadata, data)
end
task = Threads.@spawn read_dir!(joinpath(directory_path, entry), file_metadata, lk)
push!(tasks, task)
elseif data.is_file
lock(lk) do
push!(file_metadata, data)
end
end
end
for task in tasks
wait(task)
end
end
function walk_storage(file_path::String)::Vector{Filemetadata}
# Vector to store the file metadata with size 1024
file_metadata = Vector{Filemetadata}()
lock = ReentrantLock()
read_dir!(file_path, file_metadata, lock)
return file_metadata
end
function main()
result = walk_storage("/mnt/sn850x")
total_sz_b = 0
for file in result
total_sz_b += file.size
end
total_size_mb = total_sz_b / 1024 / 1024
total_size_gb = total_sz_b / 1024 / 1024 / 1024
println("[+] Total size of all files: ", total_sz_b, " B")
println("[+] Total size of all files: ", total_size_mb, " MB")
println("[+] Total size of all files: ", total_size_gb, " GB")
println("[+] Total number of files: ", length(result))end
if abspath(PROGRAM_FILE) == @__FILE__
main()
end
C++ Impl
#include <iostream>
#include <utility>
#include <sys/stat.h>
#include <filesystem>
#include <mutex>
#include <vector>
#include "mimalloc-new-delete.h" // enable mimalloc new/delete overloads
#include "BS_thread_pool.hpp"
namespace fs = std::filesystem;
namespace chrono = std::chrono;
class file_metadata {
public:
// Name of the file, including the path
std::string name;
// Size of the file in bytes
long size;
// Time when the file was created
chrono::time_point<chrono::system_clock> creation_time;
// Time when the file was last modified
chrono::time_point<chrono::system_clock> modification_time;
// Time when the file was last accessed
chrono::time_point<chrono::system_clock> access_time;
// Whether the file is a directory
bool is_directory;
// Whether the file is a regular file
bool is_file;
// Is the file a symlink
bool is_symlink;
file_metadata(std::string name, long size,
chrono::time_point<chrono::system_clock> creation_time,
chrono::time_point<chrono::system_clock> modification_time,
chrono::time_point<chrono::system_clock> access_time,
bool is_directory, bool is_file, bool is_symlink)
: name(std::move(name)), size(size), creation_time(creation_time),
modification_time(modification_time), access_time(access_time),
is_directory(is_directory), is_file(is_file), is_symlink(is_symlink) {}
static auto from_path(const fs::path &path) -> file_metadata {
// call stat on the path
struct stat file_stat{};
lstat(path.c_str(), &file_stat);
chrono::time_point<std::chrono::system_clock> access_time(
chrono::seconds(file_stat.st_atim.tv_sec) +
chrono::nanoseconds(file_stat.st_atim.tv_nsec)
);
chrono::time_point<std::chrono::system_clock> modification_time(
chrono::seconds(file_stat.st_mtim.tv_sec) +
chrono::nanoseconds(file_stat.st_mtim.tv_nsec)
);
chrono::time_point<std::chrono::system_clock> creation_time(
chrono::seconds(file_stat.st_ctim.tv_sec) +
chrono::nanoseconds(file_stat.st_ctim.tv_nsec)
);
return {
path.string(),
S_ISDIR(file_stat.st_mode) ? 0 : file_stat.st_size,
creation_time,
modification_time,
access_time,
S_ISDIR(file_stat.st_mode),
S_ISREG(file_stat.st_mode),
S_ISLNK(file_stat.st_mode)
};
}
};
class dirstat {
private:
std::vector<file_metadata> entries;
std::mutex entries_mutex;
BS::thread_pool pool;
auto read_dir(const fs::path &directory_path) -> void {
for(const auto &entry_path: fs::directory_iterator(directory_path)) {
auto file_metadata = file_metadata::from_path(entry_path);
if (file_metadata.is_directory) {
std::scoped_lock lock(entries_mutex);
entries.emplace_back(file_metadata);
pool.detach_task([this, entry_path] {
return read_dir(entry_path);
});
} else if (file_metadata.is_file) {
std::scoped_lock lock(entries_mutex);
entries.emplace_back(file_metadata);
}
}
}
public:
auto walk_storage(const fs::path &path) -> std::vector<file_metadata> const& {
read_dir(path);
pool.wait();
return entries;
}
};
auto main() -> int {
std::string root = "/mnt/sn850x";
dirstat ds;
auto& result = ds.walk_storage(root);
long total_size = 0;
for (const auto &entry: result) {
total_size += entry.size;
}
double total_size_mb = (double)total_size / (1024.0 * 1024.0);
double total_size_gb = (double)total_size / (1024.0 * 1024.0 * 1024.0);
std::cout << "[+] Total size of all files: " << total_size << " B\n";
std::cout << "[+] Total size of all files: " << std::format("{}", total_size_mb) << " MB\n";
std::cout << "[+] Total size of all files: " << std::format("{}", total_size_gb) << " GB\n";
std::cout << "[+] Number of files: " << result.size() << "\n";
return 0;
}
Java Impl
package sg.edu.ntu;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class Main {
record FileMetadata(
String name,
long sz,
FileTime creationTime,
FileTime modificationTime,
FileTime lastAccessTime,
boolean isDirectory,
boolean isFile
) {
static FileMetadata tryFromPath(Path path) {
try {
BasicFileAttributes attributes = Files.readAttributes(path, BasicFileAttributes.class);
boolean isDirectory = attributes.isDirectory();
return new FileMetadata(
path.toString(),
isDirectory ? 0 : attributes.size(),
attributes.creationTime(),
attributes.lastModifiedTime(),
attributes.lastAccessTime(),
isDirectory,
attributes.isRegularFile()
);
} catch (IOException e) {
System.out.printf("[-] Got IOException: %s\n", e);
}
return null;
}
}
public static void main(String[] args) {
List<FileMetadata> files = walkStorage("/mnt/sn850x");
long totalSz = files.stream().mapToLong(FileMetadata::sz).sum();
double totalSzMb = totalSz / (1024.0 * 1024.0);
double totalSzGb = totalSz / (1024.0 * 1024.0 * 1024.0);
System.out.printf("[+] Total size of all files: %d B\n", totalSz);
System.out.printf("[+] Total size of all files: %f MB\n", totalSzMb);
System.out.printf("[+] Total size of all files: %f GB\n", totalSzGb);
System.out.printf("[+] Number of files: %d\n", files.size());
}
private static List<FileMetadata> walkStorage(String path) {
try(ExecutorService pool = Executors.newWorkStealingPool()) {
List<FileMetadata> fileMetadata = Collections.synchronizedList(new ArrayList<>());
readDir(path, fileMetadata, pool);
return fileMetadata;
} catch (Exception e) {
return List.of();
}
}
private static void readDir(String path, List<FileMetadata> fileMetadata, ExecutorService pool) {
File baseFile = new File(path);
File[] files = baseFile.listFiles(file -> !Files.isSymbolicLink(file.toPath()));
if(files != null) {
for(File file: files) {
if(file.isDirectory()) {
fileMetadata.add(FileMetadata.tryFromPath(file.toPath()));
pool.submit(() -> readDir(file.getAbsolutePath(), fileMetadata, pool));
} else {
fileMetadata.add(FileMetadata.tryFromPath(file.toPath()));
}
}
}
}
}