How do I do the following pandas manipulation on a Julia DataFrame column?
>>> s = Series(['a1', 'b2', 'c3'])
>>> s.str.extract('(?P<letter>[ab])(?P<digit>\d)')
letter digit
0 a 1
1 b 2
2 NaN NaN
How do I do the following pandas manipulation on a Julia DataFrame column?
>>> s = Series(['a1', 'b2', 'c3'])
>>> s.str.extract('(?P<letter>[ab])(?P<digit>\d)')
letter digit
0 a 1
1 b 2
2 NaN NaN
I am not aware of a built-in method, but here is something to get you started.
Julia supports regexes, so the first part would be:
# Sample input: one string per row.
data = ["a1", "b2", "c3"]
# Two named capture groups: `letter` (either a or b) followed by `digit` (one digit).
regex = r"(?P<letter>[ab])(?P<digit>\d)"
# Apply `match` to every element of `data`, treating `regex` as a scalar.
# This is the function-call spelling of the dotted form `match.(regex, data)`.
result = broadcast(match, regex, data)
3-element Array{Any,1}:
RegexMatch("a1", letter="a", digit="1")
RegexMatch("b2", letter="b", digit="2")
nothing
So now you have to convert the `result` array into a DataFrame.
…
This is a bit annoying, and the code below is not the most efficient way of doing it:
# Written for Julia 1.x (`Nothing`, `missing`) and a current DataFrames.jl;
# assumes `using DataFrames` has been run and that `result` holds the output of
# `match.(regex, data)`, i.e. a vector of `RegexMatch` / `nothing` values.

"""
    getnames(m)

Return the names of the named capture groups of the regex that produced `m`,
ordered by group number (the order in which they appear in the pattern).
Return an empty `String` vector when `m` is `nothing` (no match).
"""
function getnames(m::RegexMatch)
    # NOTE: `Base.PCRE.capture_names` is an internal helper, not public API.
    # It maps capture-group number => group name for the compiled pattern.
    groups = Base.PCRE.capture_names(m.regex.regex)
    # A Dict has no defined iteration order, so sort by group number to get a
    # stable, pattern-ordered list of names.
    return String[groups[i] for i in sort!(collect(keys(groups)))]
end
getnames(::Nothing) = String[]

# Union of all group names seen across the matches, as column symbols.
# `init` keeps this working when `result` is empty.
columns = Symbol.(unique(reduce(vcat, getnames.(result); init=String[])))

# One empty column per capture group; the element type allows `missing` so that
# rows without a match can be stored.
df = DataFrame([name => Union{String,Missing}[] for name in columns])

for m in result
    if m === nothing
        # No match at all: the whole row is missing.
        push!(df, fill(missing, length(columns)))
        continue
    end
    groupnames = getnames(m)
    row = Union{String,Missing}[]
    for column in columns
        # Group names are Strings while columns are Symbols, so compare as Strings.
        # A group that exists in the pattern but did not participate in this
        # match is reported as `nothing`; store that as `missing` too.
        value = String(column) in groupnames ? m[column] : nothing
        push!(row, value === nothing ? missing : String(value))
    end
    push!(df, row)
end
@vchuravy looks rather tedious; thanks for the details nevertheless.