CSV.read stopped working

I have a script that I have used regularly in the past to read a CSV-file and output a PostgreSQL-dumpfile to create a table and populate it with data.

This script does not work anymore and it seems that something has changed in the CSV-library.

My version: Version 0.6.3-pre.0 (2017-12-18 07:11 UTC)

Code:

julia> CSV.read("/home/js/Downloads/data-1512997404715.csv")
ERROR: MissingException: encountered a missing value for a non-null column type on row = 686, col = 3
Stacktrace:
 [1] (::CSV.##4#5)(::Int64, ::Int64) at /home/js/.julia/v0.6/CSV/src/parsefields.jl:122
 [2] parsefield(::Base.AbstractIOBuffer{Array{UInt8,1}}, ::Type{WeakRefString{UInt8}}, ::CSV.Options{Void}, ::Int64, ::Int64, ::Base.RefValue{CSV.ParsingState}, ::CSV.##4#5) at /home/js/.julia/v0.6/CSV/src/parsefields.jl:233
 [3] parsefield at /home/js/.julia/v0.6/CSV/src/parsefields.jl:315 [inlined]
 [4] parsefield at /home/js/.julia/v0.6/CSV/src/parsefields.jl:129 [inlined] (repeats 2 times)
 [5] streamfrom at /home/js/.julia/v0.6/CSV/src/Source.jl:209 [inlined]
 [6] macro expansion at /home/js/.julia/v0.6/DataStreams/src/DataStreams.jl:542 [inlined]
 [7] stream!(::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},Void}, ::Type{DataStreams.Data.Field}, ::DataFrames.DataFrameStream{Tuple{CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}},CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}},CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}},CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}},Array{Int64,1},Array{Int64,1},CategoricalArrays.CategoricalArray{Union{Missings.Missing, String},1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Missings.Missing},CategoricalArrays.CategoricalArray{Union{Missings.Missing, String},1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Missings.Missing},CategoricalArrays.CategoricalArray{Union{Missings.Missing, String},1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Missings.Missing},CategoricalArrays.CategoricalArray{Union{Missings.Missing, String},1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Missings.Missing},CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}},CategoricalArrays.CategoricalArray{String,1,UInt32,String,CategoricalArrays.CategoricalString{UInt32},Union{}}}}, ::DataStreams.Data.Schema{true,Tuple{CategoricalArrays.CategoricalString{UInt32},CategoricalArrays.CategoricalString{UInt32},CategoricalArrays.CategoricalString{UInt32},CategoricalArrays.CategoricalString{UInt32},Int64,Int64,Union{CategoricalArrays.CategoricalString{UInt32}, Missings.Missing},Union{CategoricalArrays.CategoricalString{UInt32}, Missings.Missing},Union{CategoricalArrays.CategoricalString{UInt32}, Missings.Missing},Union{CategoricalArrays.CategoricalString{UInt32}, Missings.Missing},CategoricalArrays.CategoricalString{UInt32},CategoricalArrays.CategoricalString{UInt32}}}, ::Int64, ::NTuple{12,Base.#identity}, ::DataStreams.Data.##15#16, ::Array{Any,1}, ::Type{Ref{(:ut, :combined_name, :first_name, :author_role, :author_position, :addr_no, :email, :pref_name, :organisation, :suborgname, :city, :country)}}) at /home/js/.julia/v0.6/DataStreams/src/DataStreams.jl:614
 [8] #stream!#17(::Bool, ::Dict{Int64,Function}, ::Function, ::Array{Any,1}, ::Array{Any,1}, ::Function, ::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},Void}, ::Type{DataFrames.DataFrame}) at /home/js/.julia/v0.6/DataStreams/src/DataStreams.jl:490
 [9] (::DataStreams.Data.#kw##stream!)(::Array{Any,1}, ::DataStreams.Data.#stream!, ::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},Void}, ::Type{DataFrames.DataFrame}) at ./<missing>:0
 [10] #read#47(::Bool, ::Dict{Int64,Function}, ::Bool, ::Array{Any,1}, ::Function, ::String, ::Type{T} where T) at /home/js/.julia/v0.6/CSV/src/Source.jl:314
 [11] read(::String) at /home/js/.julia/v0.6/CSV/src/Source.jl:313

According to CSV’s latest documentation the default for CSV is to handle null values:

“nullable::Bool; indicates whether values can be nullable or not; true by default. If set to false and missing values are encountered, a NullException will be thrown”

How can CSV decide which column is a “non-null column type”?

Regards
Johann

See https://github.com/JuliaData/CSV.jl/issues/159 and https://github.com/JuliaData/CSV.jl/issues/153. We definitely need to improve this, but as a workaround you can increase the value of rows_for_type_detect, e.g. to 1000.

2 Likes

Thanks @nalimilan. That workaround helped.

Regards
Johann

You could also try CSVFiles.jl. The underlying parser (TextParse.jl) can handle missing values in columns that show up the first time even after the rows that are used for type detection, so I think it might just work on that kind of file without any further options.

1 Like

Helped me also with a Boundserror problem. Thanks a lot!

My error message was:

BoundsError: attempt to access 0-element Array{UInt8,1} at index [1]
macro expansion at parsefields.jl:81 [inlined]
parsefield(::Base.AbstractIOBuffer{Array{UInt8,1}}, ::Type{Float64}, ::CSV.Options{DateFormat{Symbol("yyyy-mm-dd\\THH:MM:SS.s"),Tuple{Base.Dates.DatePart{'y'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'m'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'d'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'H'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'M'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'S'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'s'}}}}, ::Int64, ::Int64, ::Base.RefValue{CSV.ParsingState}, ::CSV.##4#5) at float.jl:215
macro expansion at DataStreams.jl:542 [inlined]
stream!(::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},DateFormat{Symbol("yyyy-mm-dd\\THH:MM:SS.s"),Tuple{Base.Dates.DatePart{'y'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'m'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'d'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'H'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'M'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'S'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'s'}}}}, ::Type{DataStreams.Data.Field}, ::DataFrames.DataFrameStream{Tuple{Array{Date,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Int64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1},WeakRefStrings.WeakRefStringArray{WeakRefString{UInt8},1,Union{}},Array{Float64,1},Array{Float64,1},Array{Float64,1},Array{Float64,1}}}, ::DataStreams.Data.Schema{true,Tuple{Date,Float64,Float64,Float64,Float64,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,WeakRefString{UInt8},Float64,Float64,Float64,Float64}}, ::Int64, ::NTuple{28,Base.#identity}, ::DataStreams.Data.##15#16, ::Array{Any,1}, ::Type{Ref{(:Date, :mean_sentiment, :median_sentiment, :var_sentiment, :kurt_sentiment, :posts, :q90, :q80, :q75, :q25, :q10, :optvar, :optmean, :optoptvar, :optoptmean, :meang, :varg, :kurtg, :postsg, :q90g, :q80g, :q75g, :q25g, :q10g, :MA_mean, :mean_detrended, :MA_var, :var_detrended)}}) at DataStreams.jl:614
#stream!#17(::Bool, ::Dict{Int64,Function}, ::Function, ::Array{Any,1}, ::Array{Any,1}, ::Function, ::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},DateFormat{Symbol("yyyy-mm-dd\\THH:MM:SS.s"),Tuple{Base.Dates.DatePart{'y'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'m'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'d'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'H'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'M'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'S'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'s'}}}}, ::Type{DataFrames.DataFrame}) at DataStreams.jl:490
(::DataStreams.Data.#kw##stream!)(::Array{Any,1}, ::DataStreams.Data.#stream!, ::CSV.Source{Base.AbstractIOBuffer{Array{UInt8,1}},DateFormat{Symbol("yyyy-mm-dd\\THH:MM:SS.s"),Tuple{Base.Dates.DatePart{'y'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'m'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'d'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'H'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'M'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'S'},Base.Dates.Delim{Char,1},Base.Dates.DatePart{'s'}}}}, ::Type{DataFrames.DataFrame}) at <missing>:0
#read#47(::Bool, ::Dict{Int64,Function}, ::Bool, ::Array{Any,1}, ::Function, ::String, ::Type{T} where T) at Source.jl:315
read(::String, ::Type{T} where T) at Source.jl:314
include_string(::String, ::String) at loading.jl:522
include_string(::String, ::String, ::Int64) at eval.jl:30
include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N} where N) at eval.jl:34
(::Atom.##100#105{String,Int64,String})() at eval.jl:75
withpath(::Atom.##100#105{String,Int64,String}, ::String) at utils.jl:30
withpath(::Function, ::String) at eval.jl:38
hideprompt(::Atom.##99#104{String,Int64,String}) at repl.jl:65
macro expansion at eval.jl:73 [inlined]
(::Atom.##98#103{Dict{String,Any}})() at task.jl:80

Your error looks like a different problem. Please file an issue against CSV.jl with a way to reproduce it if you have the time.