I have a set of CSV files, about 1.7 GB in total. (I’m attempting to compete in this month’s r/dataisbeautiful visualization competition.)
When trying to load all the files with loadtable(), I run into some problems (both when running with multiple cores and when running with a single core).
table = loadtable("/Users/mcintna1/Documents/DataSets/dataisbeautiful_march/",
output = "data",
indexcols = [1],
skiplines_begin = 1,
distributed = true)
This gives me the following error:
Error parsing /Users/mcintna1/Documents/DataSets/dataisbeautiful_march/julia_analysis.ipynb
Couldn't split line, error at char 11:
"cells": [
_______^
error(::String) at ./error.jl:33
quotedsplit(::TextParse.VectorBackedUTF8String, ::TextParse.LocalOpts{UInt8,UInt8,UInt8}, ::Bool, ::Int64, ::Int64) at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:668
readcolnames(::TextParse.VectorBackedUTF8String, ::TextParse.LocalOpts{UInt8,UInt8,UInt8}, ::Int64, ::Array{String,1}) at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:462
#_csvread_internal#26(::Bool, ::Char, ::Char, ::Type, ::Type, ::Bool, ::Int64, ::Array{String,1}, ::Nothing, ::Int64, ::Array{String,1}, ::Bool, ::Array{String,1}, ::Array{String,1}, ::OrderedCollections.OrderedDict{Union{Int64, String},AbstractArray{T,1} where T}, ::Int64, ::Dict{Any,Any}, ::Array{Any,1}, ::String, ::Int64, ::typeof(TextParse._csvread_internal), ::TextParse.VectorBackedUTF8String, ::Char) at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:209
(::getfield(TextParse, Symbol("#kw##_csvread_internal")))(::NamedTuple{(:filename, :rowno, :colspool, :prevheaders, :noresize, :prev_parsers, :samecols, :skiplines_begin),Tuple{String,Int64,OrderedCollections.OrderedDict{Union{Int64, String},AbstractArray{T,1} where T},Array{String,1},Bool,Dict{Any,Any},Array{String,1},Int64}}, ::typeof(TextParse._csvread_internal), ::TextParse.VectorBackedUTF8String, ::Char) at ./none:0
(::getfield(TextParse, Symbol("##22#24")){Base.Iterators.Pairs{Symbol,Any,NTuple{7,Symbol},NamedTuple{(:rowno, :colspool, :prevheaders, :noresize, :prev_parsers, :samecols, :skiplines_begin),Tuple{Int64,OrderedCollections.OrderedDict{Union{Int64, String},AbstractArray{T,1} where T},Array{String,1},Bool,Dict{Any,Any},Array{String,1},Int64}}},String,Char})(::IOStream) at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:108
#open#310(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::getfield(TextParse, Symbol("##22#24")){Base.Iterators.Pairs{Symbol,Any,NTuple{7,Symbol},NamedTuple{(:rowno, :colspool, :prevheaders, :noresize, :prev_parsers, :samecols, :skiplines_begin),Tuple{Int64,OrderedCollections.OrderedDict{Union{Int64, String},AbstractArray{T,1} where T},Array{String,1},Bool,Dict{Any,Any},Array{String,1},Int64}}},String,Char}, ::String, ::Vararg{String,N} where N) at ./iostream.jl:369
open at ./iostream.jl:367 [inlined]
#_csvread_f#20 at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:105 [inlined]
(::getfield(TextParse, Symbol("#kw##_csvread_f")))(::NamedTuple{(:rowno, :colspool, :prevheaders, :noresize, :prev_parsers, :samecols, :skiplines_begin),Tuple{Int64,OrderedCollections.OrderedDict{Union{Int64, String},AbstractArray{T,1} where T},Array{String,1},Bool,Dict{Any,Any},Array{String,1},Int64}}, ::typeof(TextParse._csvread_f), ::String, ::Char) at ./none:0
#csvread#25(::Base.Iterators.Pairs{Symbol,Any,Tuple{Symbol,Symbol},NamedTuple{(:samecols, :skiplines_begin),Tuple{Array{String,1},Int64}}}, ::Function, ::Array{String,1}, ::Char) at /Users/mcintna1/.julia/packages/TextParse/o3nmV/src/csv.jl:140
#csvread at ./none:0 [inlined]
#_loadtable_serial#3(::Char, ::Array{Int64,1}, ::Nothing, ::Nothing, ::Nothing, ::Bool, ::Bool, ::typeof(csvread), ::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:skiplines_begin,),Tuple{Int64}}}, ::typeof(JuliaDB._loadtable_serial), ::UnionAll, ::Array{String,1}) at /Users/mcintna1/.julia/packages/JuliaDB/ZXPIx/src/util.jl:83
(::getfield(JuliaDB, Symbol("##190#193")){Array{Int64,1},Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:skiplines_begin,),Tuple{Int64}}},UnionAll})(::Array{String,1}) at ./none:0
do_task(::Dagger.Context, ::Dagger.OSProc, ::Int64, ::Function, ::Tuple{Array{String,1}}, ::Bool, ::Bool, ::Bool) at /Users/mcintna1/.julia/packages/Dagger/sdZXi/src/scheduler.jl:259
#143 at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/remotecall.jl:339 [inlined]
run_work_thunk(::getfield(Distributed, Symbol("##143#144")){typeof(Dagger.Sch.do_task),Tuple{Dagger.Context,Dagger.OSProc,Int64,getfield(JuliaDB, Symbol("##190#193")){Array{Int64,1},Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:skiplines_begin,),Tuple{Int64}}},UnionAll},Tuple{Array{String,1}},Bool,Bool,Bool},Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}}, ::Bool) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/process_messages.jl:56
#remotecall_fetch#148(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::Function, ::Distributed.LocalProcess, ::Dagger.Context, ::Vararg{Any,N} where N) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/remotecall.jl:364
remotecall_fetch(::Function, ::Distributed.LocalProcess, ::Dagger.Context, ::Vararg{Any,N} where N) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/remotecall.jl:364
#remotecall_fetch#152(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::Function, ::Int64, ::Dagger.Context, ::Vararg{Any,N} where N) at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/remotecall.jl:406
remotecall_fetch at /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.1/Distributed/src/remotecall.jl:406 [inlined]
macro expansion at /Users/mcintna1/.julia/packages/Dagger/sdZXi/src/scheduler.jl:272 [inlined]
(::getfield(Dagger.Sch, Symbol("##13#14")){Dagger.Context,Dagger.OSProc,Int64,getfield(JuliaDB, Symbol("##190#193")){Array{Int64,1},Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:skiplines_begin,),Tuple{Int64}}},UnionAll},Tuple{Array{String,1}},Channel{Any},Bool,Bool,Bool})() at ./task.jl:259
Stacktrace:
[1] compute_dag(::Dagger.Context, ::Dagger.Thunk) at /Users/mcintna1/.julia/packages/Dagger/sdZXi/src/scheduler.jl:62
[2] compute(::Dagger.Context, ::Dagger.Thunk) at /Users/mcintna1/.julia/packages/Dagger/sdZXi/src/compute.jl:25
[3] #fromchunks#47(::String, ::Int64, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::Array{Dagger.Thunk,1}) at /Users/mcintna1/.julia/packages/JuliaDB/ZXPIx/src/table.jl:141
[4] (::getfield(JuliaDB, Symbol("#kw##fromchunks")))(::NamedTuple{(:output, :fnoffset),Tuple{String,Int64}}, ::typeof(JuliaDB.fromchunks), ::Array{Dagger.Thunk,1}) at ./none:0
[5] #_loadtable#188(::Nothing, ::String, ::Bool, ::Array{Int64,1}, ::Bool, ::Bool, ::Base.Iterators.Pairs{Symbol,Int64,Tuple{Symbol},NamedTuple{(:skiplines_begin,),Tuple{Int64}}}, ::Function, ::Type, ::String) at /Users/mcintna1/.julia/packages/JuliaDB/ZXPIx/src/io.jl:140
[6] #_loadtable at ./none:0 [inlined]
[7] #loadtable#186 at /Users/mcintna1/.julia/packages/JuliaDB/ZXPIx/src/io.jl:63 [inlined]
[8] (::getfield(JuliaDB, Symbol("#kw##loadtable")))(::NamedTuple{(:output, :indexcols, :skiplines_begin, :distributed),Tuple{String,Array{Int64,1},Int64,Bool}}, ::typeof(loadtable), ::String) at ./none:0
[9] top-level scope at In[8]:1
However, when loading each table by itself, I run into no errors!
for file in CSVs
table = loadtable(file, indexcols = [1], skiplines_begin = 1)
println("Loaded table: $file")
end
Here, CSVs is a vector containing the paths of all the CSV files. The loop runs through every file and finishes with no errors.
I can’t quite figure out where this error is coming from, or how to deal with it. I thought it might be “bad data” in one of the CSVs, but each one seems to load just fine on its own. Is the error happening in the step that combines the files?