Let’s just start with generating some sample data.
# Location of the demo data. `homedir()` is used instead of ENV["HOME"] so the
# script also works where the HOME environment variable is not set (e.g. on
# Windows), and `joinpath` picks the right path separator for the platform.
DIR0 = joinpath(homedir(), "Documents", "twosigma_demo")

# Create directory and parent directories (no error if it already exists)
mkpath(DIR0)

# Create alpha_2sigma.txt with two rows
write(
    joinpath(DIR0, "alpha_2sigma.txt"),
    """
    0 0 0
    1 1 1
    """
)

# Create beta_2sigma.txt with no rows (an empty file)
write(joinpath(DIR0, "beta_2sigma.txt"), "")

# Create gamma_2sigma.txt with 150 rows and 3 columns of random digits 0-9.
# Each matrix row becomes one space-separated line; the file ends with a newline.
write(
    joinpath(DIR0, "gamma_2sigma.txt"),
    join(join.(eachrow(rand(0:9, 150, 3)), " "), "\n") * "\n"
)

# Create delta_2sigma.txt with 160 rows and 4 columns of random digits 0-9
write(
    joinpath(DIR0, "delta_2sigma.txt"),
    join(join.(eachrow(rand(0:9, 160, 4)), " "), "\n") * "\n"
)
Next let’s inspect the results.
julia> DIR0="$(ENV["HOME"])/Documents/twosigma_demo"
"/home/mkitti/Documents/twosigma_demo"
julia> readdir(DIR0)
4-element Vector{String}:
"alpha_2sigma.txt"
"beta_2sigma.txt"
"delta_2sigma.txt"
"gamma_2sigma.txt"
julia> println(read(joinpath(DIR0, "alpha_2sigma.txt"), String))
0 0 0
1 1 1
julia> println(read(joinpath(DIR0, "beta_2sigma.txt"), String))
julia> println(read(joinpath(DIR0, "gamma_2sigma.txt"), String))
7 4 0
1 8 1
8 4 0
0 1 6
9 5 0
0 0 5
...
# abbreviated
julia> println(read(joinpath(DIR0, "delta_2sigma.txt"), String))
2 4 1 1
4 1 7 2
1 3 2 0
3 6 2 7
2 4 6 4
7 5 5 6
1 3 0 7
5 8 2 8
7 5 1 4
2 1 5 9
...
# abbreviated
Now let’s try some simple for loops.
julia> for file in readdir(DIR0)
println(file)
end
alpha_2sigma.txt
beta_2sigma.txt
delta_2sigma.txt
gamma_2sigma.txt
julia> for file in readdir(DIR0)
println(joinpath(DIR0, file))
end
/home/mkitti/Documents/twosigma_demo/alpha_2sigma.txt
/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt
/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt
/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
println(length(lines))
end
2
0
160
150
Instead of printing the number of lines in each file, let’s collect those counts into a `Vector` by pushing.
julia> nrows = Int[]
Int64[]
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
push!(nrows, length(lines))
end
julia> nrows
4-element Vector{Int64}:
2
0
160
150
There’s a simpler way to do the above via `map`:
julia> map(readdir(DIR0)) do file
file = joinpath(DIR0, file)
lines = readlines(file)
return length(lines)
end
4-element Vector{Int64}:
2
0
160
150
Now, let’s say I wanted to print the names of the files which have 150 or more lines.
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
if length(lines) >= 150
println(file)
end
end
/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt
/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt
We could then try to examine the number of columns.
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
if length(lines) > 0
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
println(length(columns_in_first_line))
else
println("No rows!")
end
end
3
No rows!
4
3
Next let’s print out the files that have at least 150 rows and at least 3 columns.
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
if NR >= 150
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
if NC >= 3
println(file, " has ", NR, " rows and ", NC, " columns")
end
end
end
/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt has 160 rows and 4 columns
/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt has 150 rows and 3 columns
Instead of printing this, let’s put it into a `Vector`.
julia> large_files = String[]
String[]
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
if NR >= 150
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
if NC >= 3
push!(large_files, file)
end
end
end
julia> large_files
2-element Vector{String}:
"/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt"
"/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"
Now let’s make this a function.
julia> function get_large_files(DIR0)
large_files = String[]
for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
if NR >= 150
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
if NC >= 3
push!(large_files, file)
end
end
end
return large_files
end
get_large_files (generic function with 1 method)
julia> get_large_files(DIR0)
2-element Vector{String}:
"/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt"
"/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"
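We can also return the number of rows and columns.
julia> function get_large_files_nrows_ncols(DIR0)
large_files = String[]
nrows = Int[]
ncols = Int[]
for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
if NR >= 150
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
if NC >= 3
push!(large_files, file)
push!(nrows, NR)
push!(ncols, NC)
end
end
end
return large_files, nrows, ncols
end
get_large_files_nrows_ncols (generic function with 1 method)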
julia> large_files, nrows, ncols = get_large_files_nrows_ncols(DIR0)
(["/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt", "/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"], [160, 150], [4, 3])
julia> large_files
2-element Vector{String}:
"/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt"
"/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"
julia> nrows
2-element Vector{Int64}:
160
150
julia> ncols
2-element Vector{Int64}:
4
3
Alternatively, we could return the rows and columns as a tuple.
julia> function get_large_files_and_sizes(DIR0)
large_files = String[]
large_file_sizes = Tuple{Int,Int}[]
for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
if NR >= 150
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
if NC >= 3
push!(large_files, file)
push!(large_file_sizes, (NR, NC))
end
end
end
return large_files, large_file_sizes
end
get_large_files_and_sizes (generic function with 1 method)
julia> large_files, sizes = get_large_files_and_sizes(DIR0)
(["/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt", "/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"], [(160, 4), (150, 3)])
julia> large_files
2-element Vector{String}:
"/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt"
"/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"
julia> sizes
2-element Vector{Tuple{Int64, Int64}}:
(160, 4)
(150, 3)
Before continuing I want to comment on comparing tuples.
The following may be surprising.
julia> (160, 4) >= (150, 5)
true
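Tuples are compared lexicographically: since 160 > 150, the first elements already decide the result and the second elements are never compared. I think you might want to do the following to explicitly compare each pair of numbers and determine whether they are all greater than or equal to the corresponding number on the right.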
julia> (160, 4) .>= (150, 5)
(true, false)
julia> all((160, 4) .>= (150, 5))
false
Using the above, let’s rewrite the function
julia> function get_large_files_and_sizes_2(DIR0)
large_files = String[]
large_file_sizes = Tuple{Int,Int}[]
for file in readdir(DIR0)
file = joinpath(DIR0, file)
lines = readlines(file)
NR = length(lines)
NC = 0
if NR > 0
first_line = first(lines)
columns_in_first_line = split(first_line, " ")
NC = length(columns_in_first_line)
end
_size = (NR, NC)
if all(_size .>= (150, 3))
push!(large_files, file)
push!(large_file_sizes, _size)
end
end
return large_files, large_file_sizes
end
get_large_files_and_sizes_2 (generic function with 1 method)
julia> large_files, large_file_sizes = get_large_files_and_sizes_2(DIR0)
(["/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt", "/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"], [(160, 4), (150, 3)])
julia> large_files
2-element Vector{String}:
"/home/mkitti/Documents/twosigma_demo/delta_2sigma.txt"
"/home/mkitti/Documents/twosigma_demo/gamma_2sigma.txt"
julia> large_file_sizes
2-element Vector{Tuple{Int64, Int64}}:
(160, 4)
(150, 3)
Now let’s try this with GMT.jl.
julia> using GMT
julia> gmtread(joinpath(DIR0, "alpha_2sigma.txt"))
BoundingBox: [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
2×3 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3
─────┼─────────────────────
1 │ 0.0 0.0 0.0
2 │ 1.0 1.0 1.0
julia> gmtread(joinpath(DIR0, "beta_2sigma.txt"))
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
String[]
julia> gmtread(joinpath(DIR0, "gamma_2sigma.txt"))
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
150×3 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3
─────┼─────────────────────
1 │ 7.0 4.0 0.0
2 │ 1.0 8.0 1.0
3 │ 8.0 4.0 0.0
4 │ 0.0 1.0 6.0
5 │ 9.0 5.0 0.0
julia> gmtread(joinpath(DIR0, "delta_2sigma.txt"))
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
160×4 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3 col.4
─────┼────────────────────────────
1 │ 2.0 4.0 1.0 1.0
2 │ 4.0 1.0 7.0 2.0
3 │ 1.0 3.0 2.0 0.0
4 │ 3.0 6.0 2.0 7.0
Let’s make sure we can examine the sizes as we would expect.
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
gmt_hold = gmtread(file)
println(size(gmt_hold))
end
(2, 3)
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
(0, 0)
(160, 4)
(150, 3)
Let’s write a function to retrieve large datasets.
julia> function get_large_datasets_gmt(DIR0)
large_datasets = GMTdataset{Float64, 2}[]
for file in readdir(DIR0)
file = joinpath(DIR0, file)
dataset = gmtread(file)
if all(size(dataset) .>= (150,3))
push!(large_datasets, dataset)
end
end
return large_datasets
end
get_large_datasets_gmt (generic function with 1 method)
julia> large_datasets = get_large_datasets_gmt(DIR0);
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
julia> length(large_datasets)
2
julia> large_datasets[1]
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
160×4 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3 col.4
─────┼────────────────────────────
1 │ 2.0 4.0 1.0 1.0
2 │ 4.0 1.0 7.0 2.0
3 │ 1.0 3.0 2.0 0.0
4 │ 3.0 6.0 2.0 7.0
5 │ 2.0 4.0 6.0 4.0
# abbreviated
julia> large_datasets[2]
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
150×3 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3
─────┼─────────────────────
1 │ 7.0 4.0 0.0
2 │ 1.0 8.0 1.0
3 │ 8.0 4.0 0.0
4 │ 0.0 1.0 6.0
5 │ 9.0 5.0 0.0
6 │ 0.0 0.0 5.0
7 │ 7.0 9.0 3.0
I will also note that there is a much more compact way of writing this.
julia> function get_large_datasets_gmt_2(DIR0)
filter(gmtread.(joinpath.(DIR0, readdir(DIR0)))) do dataset
all(size(dataset) .>= (150,3))
end
end
get_large_datasets_gmt_2 (generic function with 1 method)
julia> datasets = get_large_datasets_gmt_2(DIR0);
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
julia> datasets[1]
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
160×4 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3 col.4
─────┼────────────────────────────
1 │ 2.0 4.0 1.0 1.0
2 │ 4.0 1.0 7.0 2.0
3 │ 1.0 3.0 2.0 0.0
4 │ 3.0 6.0 2.0 7.0
5 │ 2.0 4.0 6.0 4.0
# abbreviated
julia> datasets[2]
BoundingBox: [0.0, 9.0, 0.0, 9.0, 0.0, 9.0]
150×3 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3
─────┼─────────────────────
1 │ 7.0 4.0 0.0
2 │ 1.0 8.0 1.0
3 │ 8.0 4.0 0.0
4 │ 0.0 1.0 6.0
5 │ 9.0 5.0 0.0
# abbreviated
As for the error that you are getting, the issue arises if you attempt to do the following.
julia> epoch_hold = nothing
julia> epoch_hold[:,1]
ERROR: MethodError: no method matching getindex(::Nothing, ::Colon, ::Int64)
Stacktrace:
[1] top-level scope
@ REPL[201]:1
We could avoid the above error by testing for `nothing`:
julia> epoch_hold = gmtread(joinpath(DIR0, "alpha_2sigma.txt"))
BoundingBox: [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
2×3 GMTdataset{Float64, 2}
Row │ col.1 col.2 col.3
─────┼─────────────────────
1 │ 0.0 0.0 0.0
2 │ 1.0 1.0 1.0
julia> if !isnothing(epoch_hold)
epoch_hold[:,1]
end
2-element Vector{Float64}:
0.0
1.0
julia> epoch_hold = nothing
julia> if !isnothing(epoch_hold)
epoch_hold[:,1]
end
Benny pointed out that there is a flaw in your function as written. We can demonstrate the flaw as follows.
julia> function gmt_read(file)
epoch_hold = gmtread(file, incols = "0,1,2");
if length(epoch_hold) > 100
return epoch_hold
end
end
gmt_read (generic function with 1 method)
julia> gmt_read(joinpath(DIR0, "gamma_2sigma.txt"))[:,1]
150-element Vector{Float64}:
7.0
1.0
8.0
# abbreviated
julia> gmt_read(joinpath(DIR0, "delta_2sigma.txt"))[:,1]
160-element Vector{Float64}:
2.0
4.0
# abbreviated
julia> gmt_read(joinpath(DIR0, "alpha_2sigma.txt"))[:,1]
ERROR: MethodError: no method matching getindex(::Nothing, ::Colon, ::Int64)
Stacktrace:
[1] top-level scope
@ REPL[229]:1
julia> gmt_read(joinpath(DIR0, "beta_2sigma.txt"))[:,1]
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
ERROR: MethodError: no method matching getindex(::Nothing, ::Colon, ::Int64)
Stacktrace:
[1] top-level scope
@ REPL[230]:1
The flaw here is that for small datasets your function returns `nothing`, and then you try to index the result.
julia> alpha = gmt_read(joinpath(DIR0, "alpha_2sigma.txt"))
julia> isnothing(alpha)
true
julia> alpha[:, 1]
ERROR: MethodError: no method matching getindex(::Nothing, ::Colon, ::Int64)
Stacktrace:
[1] top-level scope
@ REPL[234]:1
The reason this happens is that your function is equivalent to either of the following function definitions, in which the implicit `return nothing` is written out.
julia> function gmt_read(file)
epoch_hold = gmtread(file, incols = "0,1,2");
if length(epoch_hold) > 100
return epoch_hold
end
return nothing
end
gmt_read (generic function with 1 method)
julia> function gmt_read(file)
epoch_hold = gmtread(file, incols = "0,1,2");
if length(epoch_hold) > 100
return epoch_hold
else
return nothing
end
end
gmt_read (generic function with 1 method)
We can reproduce the error as follows.
julia> function gmt_read(file)
epoch_hold = gmtread(file, incols = "0,1,2");
if length(epoch_hold) > 100
return epoch_hold
else
return nothing
end
end
gmt_read (generic function with 1 method)
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
gmt_hold = gmt_read(file)
gmt_hold[:,1]
end
ERROR: MethodError: no method matching getindex(::Nothing, ::Colon, ::Int64)
Stacktrace:
[1] top-level scope
@ ./REPL[243]:4
We can correct it as follows.
julia> function gmt_read(file)
epoch_hold = gmtread(file, incols = "0,1,2");
return epoch_hold
end
gmt_read (generic function with 1 method)
julia> for file in readdir(DIR0)
file = joinpath(DIR0, file)
gmt_hold = gmt_read(file)
if length(gmt_hold) > 100
println(gmt_hold[:,1])
end
end
gmtread [WARNING]: File /home/mkitti/Documents/twosigma_demo/beta_2sigma.txt is empty!
┌ Warning: file "/home/mkitti/Documents/twosigma_demo/beta_2sigma.txt" is empty or has no data after the header.
└ @ GMT ~/.julia/packages/GMT/SI3aF/src/gmtreadwrite.jl:189
[2.0, 4.0, 1.0, 3.0, 2.0, 7.0, 1.0, 5.0, 7.0, 2.0, 5.0, 8.0, 6.0, 1.0, 8.0, 5.0, 5.0, 3.0, 9.0, 7.0, 7.0, 6.0, 2.0, 6.0, 4.0, 4.0, 7.0, 8.0, 9.0, 0.0, 8.0, 6.0, 7.0, 6.0, 6.0, 7.0, 4.0, 4.0, 1.0, 9.0, 2.0, 4.0, 0.0, 2.0, 6.0, 2.0, 8.0, 5.0, 8.0, 4.0, 8.0, 0.0, 2.0, 3.0, 1.0, 4.0, 6.0, 1.0, 5.0, 1.0, 0.0, 5.0, 6.0, 4.0, 6.0, 6.0, 4.0, 0.0, 0.0, 0.0, 4.0, 8.0, 5.0, 6.0, 3.0, 4.0, 9.0, 7.0, 5.0, 8.0, 0.0, 7.0, 8.0, 0.0, 2.0, 0.0, 7.0, 5.0, 3.0, 8.0, 5.0, 0.0, 9.0, 2.0, 0.0, 1.0, 8.0, 2.0, 3.0, 9.0, 4.0, 9.0, 9.0, 9.0, 3.0, 5.0, 7.0, 1.0, 6.0, 8.0, 3.0, 9.0, 3.0, 7.0, 7.0, 9.0, 1.0, 9.0, 6.0, 0.0, 0.0, 8.0, 3.0, 0.0, 8.0, 2.0, 2.0, 5.0, 1.0, 1.0, 1.0, 0.0, 7.0, 8.0, 6.0, 9.0, 3.0, 4.0, 1.0, 5.0, 3.0, 3.0, 8.0, 1.0, 1.0, 3.0, 1.0, 5.0, 8.0, 7.0, 9.0, 0.0, 0.0, 3.0, 9.0, 7.0, 2.0, 6.0, 1.0, 5.0]
[7.0, 1.0, 8.0, 0.0, 9.0, 0.0, 7.0, 0.0, 8.0, 7.0, 9.0, 6.0, 9.0, 3.0, 8.0, 3.0, 6.0, 1.0, 8.0, 0.0, 5.0, 9.0, 0.0, 7.0, 5.0, 9.0, 3.0, 7.0, 6.0, 9.0, 5.0, 0.0, 1.0, 4.0, 6.0, 6.0, 4.0, 5.0, 5.0, 0.0, 1.0, 9.0, 3.0, 9.0, 6.0, 8.0, 4.0, 7.0, 3.0, 6.0, 8.0, 4.0, 5.0, 0.0, 5.0, 8.0, 5.0, 9.0, 0.0, 7.0, 1.0, 5.0, 6.0, 4.0, 6.0, 7.0, 2.0, 2.0, 2.0, 1.0, 7.0, 9.0, 7.0, 7.0, 9.0, 5.0, 7.0, 0.0, 9.0, 3.0, 4.0, 3.0, 8.0, 6.0, 9.0, 6.0, 4.0, 3.0, 4.0, 7.0, 4.0, 3.0, 5.0, 7.0, 4.0, 4.0, 4.0, 6.0, 9.0, 3.0, 0.0, 2.0, 6.0, 4.0, 0.0, 4.0, 0.0, 6.0, 1.0, 5.0, 4.0, 8.0, 5.0, 9.0, 8.0, 1.0, 4.0, 0.0, 6.0, 9.0, 1.0, 6.0, 8.0, 4.0, 1.0, 0.0, 6.0, 2.0, 6.0, 4.0, 6.0, 4.0, 2.0, 9.0, 5.0, 1.0, 5.0, 0.0, 1.0, 1.0, 9.0, 8.0, 1.0, 9.0, 2.0, 7.0, 8.0, 0.0, 5.0, 0.0]