Hello fellow Julia coders. I’m relatively new to Julia, and one of the first things I like to do with a new language is translate an existing script into it. I was working on a simple machine learning project with the Kaggle Titanic dataset to predict survival and decided I needed a way to one-hot encode the String columns (and columns of similar String-like data types). I found a number of options in other packages but felt this was a moderately advanced challenge to try to code myself.
I’d be interested in feedback and thoughts on how to improve this function, and I hope you find it interesting as you continue to learn in the Julia world!
"""
    onehotenc(df, dropnums = false, droplast = false)

Return a `DataFrame` of one-hot (indicator) columns built from the columns of
`keepstrings(df)`.

For each of those columns, one `Bool` column is produced per unique value, in the order
`unique` reports them. The column is named `Symbol(value)` and is `true` exactly on the
rows whose entry `isequal`s that value, so `missing` entries get their own indicator
instead of propagating `missing` into the output.

# Arguments
- `df`: the `DataFrame` to encode.
- `dropnums`: accepted for backward compatibility but not consulted; `keepstrings` is
  always applied, as it was before this parameter was documented.
- `droplast`: when `true`, the indicator for the last unique value of every column is
  omitted (that level is implied by all remaining indicators being `false`).

# Returns
A `DataFrame` with one `Bool` column per retained (column, value) pair. If values from
different source columns map to the same name, later ones are renamed by `hcat` with
`makeunique = true`.

# Throws
- `ArgumentError` if `df` is not a `DataFrame`, or if two distinct values of the same
  column convert to the same column name.
"""
function onehotenc(df, dropnums = false, droplast = false)
    # Explicit check instead of `@assert`: assertions may be disabled, so they are not
    # a reliable way to validate caller input.
    df isa DataFrame || throw(ArgumentError("onehotenc expecting a DataFrame"))

    # NOTE(review): `keepstrings` is defined elsewhere; presumably it keeps only the
    # string/categorical columns -- confirm. It is called regardless of `dropnums`,
    # which preserves the results existing callers already get.
    df = keepstrings(df)

    onehot = DataFrame()
    for col in eachcol(df)
        vals = unique(col)
        nkeep = droplast ? max(length(vals) - 1, 0) : length(vals)

        # Build the indicator columns directly under their final names. `isequal`
        # matches the grouping rule used by `unique`, so `missing` and `NaN` entries
        # are flagged correctly and every indicator is a plain `Vector{Bool}`.
        block = DataFrame()
        for val in Iterators.take(vals, nkeep)
            block[!, Symbol(val)] = [isequal(x, val) for x in col]
        end

        # Two distinct values with the same textual form would silently overwrite
        # each other above; fail loudly instead.
        size(block, 2) == nkeep || throw(ArgumentError(
            "onehotenc: distinct values of a column map to the same column name"))

        # Nothing to append (no rows, or the only level was dropped).
        size(block, 2) == 0 && continue

        # `makeunique` keeps values shared by several source columns from aborting
        # the concatenation.
        onehot = size(onehot, 2) == 0 ? block : hcat(onehot, block; makeunique = true)
    end
    return onehot
end