I have a dataframe with a few dozen columns.
For each row, I want to ensure that a subset of the columns (let’s say the relevant columns are named x1 to x6) is sorted in ascending order, i.e. x1 <= x2 <= ... <= x6 holds within the row.
I was wondering whether there is a faster approach than what I drafted below.
#Discourse MWE check if dataframe columns are sorted
using DataFrames
using BenchmarkTools
#generate some data: n rows, six integer columns
n = 20_000_000;
#the relevant columns are integers (here: uniformly drawn from 0:89)
mat = trunc.(Int, 90 .* rand(n, 6));
#note : my actual data has additional columns (-> I need to work with a dataframe)
#the column names x1..x6 are passed explicitly: current DataFrames.jl releases no longer
#accept a bare matrix without column names, while this form works on old and new versions
df = DataFrame(mat, [Symbol("x", j) for j in 1:size(mat, 2)]);
#sort the values within each row
#my data is generally sorted, but I want to verify that this is the case (i.e. identify the dataframes whose rows are not sorted).
#I do not care about the performance of this function
"""
    mysort!(df, cols=propertynames(df))

Sort, in place, the values stored in the columns `cols` within every row of `df`, so
that afterwards the entries of each row, read in the order of `cols`, are ascending.
Columns that are not listed in `cols` are left untouched.

`cols` is a vector of column names and defaults to all columns of `df`.
Returns `nothing`.

This helper only prepares test data and is not written for speed: it allocates a
temporary vector for every row.
"""
function mysort!(df, cols=propertynames(df))
    for i in 1:size(df, 1)
        # Gather the row's values, sort them, and write them back into the same cells.
        df[i, cols] .= sort!([df[i, col] for col in cols])
    end
    return nothing
end
"""
    myIsSorted(m::Array, cols=1:6)

Return `true` if, in every row of `m`, the entries read in the order given by `cols`
are non-decreasing; return `false` as soon as one row violates this.

`cols` is a range or vector of column indices. The default `1:6` checks the first six
columns and ignores any further ones. An empty `cols` or a matrix without rows yields
`true`.

Throws a `BoundsError` if `cols` contains an index that is not a column of `m`.
"""
function myIsSorted(m::Array, cols=1:6)
    # Validate the column indices once up front; the row indices come from `axes`,
    # so together this is what makes the `@inbounds` loop below safe.
    checkindex(Bool, axes(m, 2), cols) || throw(BoundsError(m, (axes(m, 1), cols)))
    isempty(cols) && return true
    lead = first(cols)
    rest = Iterators.drop(cols, 1)
    @inbounds for i in axes(m, 1)
        # Walk the row once, comparing each entry with its predecessor.
        prev = m[i, lead]
        for c in rest
            cur = m[i, c]
            cur >= prev || return false
            prev = cur
        end
    end
    return true
end
"""
    myIsSorted(df)

Check row-wise sortedness of tabular input such as a `DataFrame`: copy `df` into a
`Matrix{Int64}` and hand that matrix to the `Array` method of `myIsSorted`.

The conversion covers every column of `df`, so it errors if any column holds values
that cannot be represented as `Int64`, and it allocates a dense copy of the whole
table on each call.
"""
function myIsSorted(df)
    # The `Matrix{Int64}` constructor replaces `convert(Array{Int64,2}, df)`, which
    # DataFrames.jl deprecated and later removed; the resulting matrix is the same.
    m = Matrix{Int64}(df)
    return myIsSorted(m)
end
#sort data: bring the values of every row into ascending order before timing the check
mysort!(df) #I do not care about the performance of this function
#benchmark the check; `$df` interpolates the global variable into the benchmarked expression
@btime myIsSorted($df) #0.37s, 915MB for 20m rows