Hello,
I am new Julia and trying to train a basic SVM model for multiclass text classification in Julia. My dataset has around 75K rows and 2 columns (text and label). The context of the dataset is the abstracts of scientific papers gathered from PubMed. I have 10 labels in the dataset.
The dataset looks like this:
I keep receiving two different Method errors. The starting one is:
ERROR: MethodError: no method matching DocumentTermMatrix(::Vector{String})
I have tried:
convert(Array,data[:,:text])
and also:
convert(Matrix,data[:,:text])
Array conversion gives the same error, and matrix conversion gives:
ERROR: MethodError: no method matching (Matrix)(::Vector{String})
My code is:
using DataFrames, CSV, StatsBase,Printf, LIBSVM, TextAnalysis, Random
function ReadData(data)
df = CSV.read(data, DataFrame)
return df
end
function splitdf(df, pct)
@assert 0 <= pct <= 1
ids = collect(axes(df, 1))
shuffle!(ids)
sel = ids .<= nrow(df) .* pct
return view(df,sel, :), view(df, .!sel, :)
end
function Feature_Extract(data)
Text = convert(Array,data[:,:text])
m = DocumentTermMatrix(Text)
X = tfidf(m)
return X
end
function Classify(data)
data = ReadData(data)
train, test = splitdf(data, 0.5)
ytrain = train.label
ytest = test.label
Xtrain = Feature_Extract(train)
Xtest = Feature_Extract(test)
model = svmtrain(Xtrain, ytrain)
ŷ, decision_values = svmpredict(model, Xtest);
@printf "Accuracy: %.2f%%\n" mean(ŷ .== ytest) * 100
end
data = "data/composite_data.csv"
@time Classify(data)
I understand that DocumentTermMatrix requires a corpus. My question is how can I iteratively (ideally not in a time expensive way) create a corpus based on my dataframe.
I really appreciate your help on my first try with JuliaLang.
Thank you!
EDIT:
I have managed to get the corpus but now facing DimensionMismatch Error:
using DataFrames, CSV, StatsBase,Printf, LIBSVM, TextAnalysis, Random
function ReadData(data)
df = CSV.read(data, DataFrame)
#count = countmap(df.label)
#println(count)
#amt,lesslabel = findmin(count)
#println(amt, lesslabel)
#println(first(df,5))
return df
end
function splitdf(df, pct)
@assert 0 <= pct <= 1
ids = collect(axes(df, 1))
shuffle!(ids)
sel = ids .<= nrow(df) .* pct
return view(df,sel, :), view(df, .!sel, :)
end
function Feature_Extract(data)
crps = Corpus(StringDocument.(data.text))
update_lexicon!(crps)
m = DocumentTermMatrix(crps)
X = tf_idf(m)
return X
end
function Classify(data)
data = ReadData(data)
#println(labels)
#println(first(instances))
train, test = splitdf(data, 0.5)
ytrain = train.label
ytest = test.label
Xtrain = Feature_Extract(train)
Xtest = Feature_Extract(test)
model = svmtrain(Xtrain, ytrain)
ŷ, decision_values = svmpredict(model, Xtest);
@printf "Accuracy: %.2f%%\n" mean(ŷ .== ytest) * 100
end
data = "data/composite_data.csv"
@time Classify(data)
Error:
ERROR: DimensionMismatch("Size of second dimension of training instance\n matrix (247317) does not match length of\n labels (38263)")