# 1 - Create input file
using Random
open("in.txt", "w") do io
    foreach(_ -> println(io, randstring(rand(1:9))), 1:1_000_000)
end
# 2 - Read n random lines
using StatsBase
Nlines = countlines("in.txt")
n = 5_000
ix = Set(sample(1:Nlines, n; replace=false))  # Set gives O(1) membership tests in the loop
str = Vector{String}(undef, n)
i = j = 1
for line in eachline("in.txt")
    if i in ix
        str[j] = line
        j += 1
    end
    j == n + 1 && break
    i += 1
end
# 3 - shuffle the n lines and output to file
shuffle!(str)
open("out.txt", "w") do io
for i in eachindex(str)
println(io, str[i])
end
end
using Mmap
function process_file2(in_fn, out_fn, n)
    f = open(in_fn, "r")
    fout = open(out_fn, "w")
    mm = Mmap.mmap(f, Vector{UInt8})
    L = length(mm)
    i = 0
    starts = Set{Int}()   # byte offsets of lines already written
    while i < n
        # Pick a random byte and scan back to the start of the line containing
        # it; any line (including the first) can be selected this way, with
        # probability proportional to its byte length.
        ix = rand(1:L)
        while ix > 1 && mm[ix-1] != UInt8('\n')
            ix -= 1
        end
        ix in starts && continue   # rejection step: line already sampled
        push!(starts, ix)
        while ix <= L && mm[ix] != UInt8('\n')
            write(fout, mm[ix])
            ix += 1
        end
        write(fout, '\n')
        i += 1
    end
    close(fout)
    close(f)
end
n = 5_000
process_file2("infile.txt", "outfile.txt", n)
This tries to avoid reading the whole file, so its cost scales with n rather than with Nlines. In practice, it was about 1000x faster than the previous post.
A somewhat annoying cost is non-uniformity when lines have different lengths: a random byte position lands in a given line with probability proportional to that line's byte length, so longer lines are oversampled. Also, n can't be too close to the original number of lines, or the rejection step starts discarding most draws.
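If you want a feel for how far from uniform this is on a given file, you can compute each line's selection probability from its byte length (a rough sketch, using the in.txt generated in the first post):
lens = [ncodeunits(l) + 1 for l in eachline("in.txt")]  # +1 for the newline byte
p = lens ./ sum(lens)      # per-line selection probability under the byte trick
extrema(p) .* length(p)    # ratio to uniform; (1.0, 1.0) would mean no skew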
In addition to the suggestions here, note that readdlm is untyped and thus very slow and bad for large files; work with lines instead. I would generally just avoid readdlm entirely, and I regret that it is a stdlib.
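For example, instead of readdlm, a typed line-by-line parse of a two-column numeric file might look like this (a sketch; data.txt and its format are hypothetical):
xs = Float64[]
ys = Float64[]
for line in eachline("data.txt")
    a, b = split(line)               # split on whitespace
    push!(xs, parse(Float64, a))     # parse into concretely typed vectors
    push!(ys, parse(Float64, b))
end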
The simplest version of what you want would be this:
using Random
foreach(println, shuffle!(readlines())[1:5000])
This should be reasonably efficient, and it is actually shorter than the shell commands while doing the equivalent work. If you run it from the command line, it works like this:
julia -e 'using Random; foreach(println, shuffle!(readlines())[1:5000])' < file.txt > shuffled_trimmed_file.txt
Or if you want to open named files it gets a bit more verbose:
using Random
open("file.txt", read=true) do fin
    open("shuffled_trimmed_file.txt", write=true) do fout
        lines = shuffle!(readlines(fin))
        for i = 1:5000
            println(fout, lines[i])
        end
    end
end
This can definitely be golfed to be shorter, but you get the point.
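For instance, one possible golfed variant with the same file names is:
using Random
open("shuffled_trimmed_file.txt", "w") do out
    foreach(l -> println(out, l), shuffle!(readlines("file.txt"))[1:5000])
end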
It would, however, be much more efficient to use reservoir sampling and only keep at most 5000 lines in memory at a time. There’s a very cool package called StreamSampling that implements this for you:
using StreamSampling
lines = itsample(eachline("file.txt"), 5000)
That’s it, and it’s wildly efficient since it never needs to hold more than 5000 lines in memory.
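If you're curious what the package does under the hood, here is a minimal hand-rolled reservoir sampler (Algorithm R), for illustration only; in practice use StreamSampling, which implements this and smarter variants:
function reservoir_sample(itr, n)
    res = Vector{eltype(itr)}()
    for (i, x) in enumerate(itr)
        if i <= n
            push!(res, x)             # fill the reservoir with the first n items
        elseif rand(1:i) <= n         # keep item i with probability n/i...
            res[rand(1:n)] = x        # ...overwriting a uniformly random slot
        end
    end
    return res
end
Note that this sketch does not return the lines in random order, so if you also want shuffled output, follow it with shuffle!(lines) before writing the file.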