I’ve been looking around and messing with map, eachrow and different things but I haven’t been able to figure it out.
Essentially, if I have an N x M dataframe, I want to return a new N x M dataframe in which each value is the corresponding value of the old dataframe divided by the maximum of the row it sits in.
For a 1-row dataframe of [2,4,6] it should return approximately [0.33, 0.67, 1].
In my use case, though, this needs to be applied to a dataframe with many rows.
# Example input: three integer columns with values drawn from 1:10.
df = DataFrame(a = rand(1:10, 3), b = rand(1:10, 3), c=rand(1:10, 3))
# Empty Float64 frame with the same column names as `df`. The positional
# `DataFrame(Float64, 0, 3)` constructor was removed in DataFrames 1.0 and
# would also have named the columns x1, x2, x3 instead of a, b, c.
dfn = DataFrame([name => Float64[] for name in names(df)])
for r in eachrow(df)
    # Divide every value of the row by that row's maximum and append it.
    m = collect(r) ./ maximum(r)
    push!(dfn, m)
end
I suppose it can be code-golfed, but generally it can be something like this:
# Benchmark input: a 1000×100 matrix of random Float64 values.
m = rand(1000, 100)
"""
    baz!(m)

Divide every row of `m`, in place, by that row's maximum. Returns `nothing`.
"""
function baz!(m)
    for row in eachrow(m)
        # The maximum is computed once, before the row is overwritten.
        peak = maximum(row)
        row ./= peak
    end
    return nothing
end
# Time baz! with BenchmarkTools; `$m` interpolates the global into the benchmark.
# baz! mutates `m`, so samples after the first run on already-normalised rows.
@btime baz!($m) # 386.500 μs (3000 allocations: 140.63 KiB)
"""
    baa!(m)

Divide every element of `m`, in place, by the maximum of its row, using
explicit loops and no temporary arrays. Returns `nothing`.
"""
function baa!(m)
    rows, cols = axes(m, 1), axes(m, 2)
    for r in rows
        # First pass over the row: find its largest entry.
        rowmax = -Inf
        for c in cols
            x = m[r, c]
            if rowmax < x
                rowmax = x
            end
        end
        # Second pass: scale the row by that maximum.
        for c in cols
            m[r, c] /= rowmax
        end
    end
    return nothing
end
# Time baa! with BenchmarkTools; `$m` interpolates the global into the benchmark.
@btime baa!($m) # 202.932 μs (0 allocations: 0 bytes)
which is still slower than the allocating version.
But you gave me an idea
"""
    baa2!(m)

Divide every element of `m`, in place, by the maximum of its row.

The row maxima are first collected into a temporary vector (one entry per
row); the division pass then walks `m` column by column so that the inner
loop runs over the first index. Returns `nothing`.

The `@inbounds` loops index the temporary vector with `axes(m, 1)`, so `m`
is expected to use ordinary 1-based indexing.
"""
function baa2!(m)
    # Buffer element type is at least Float64 (as before), but widens for
    # element types such as BigFloat so their maxima are not truncated.
    T = promote_type(eltype(m), Float64)
    maxi = Vector{T}(undef, size(m, 1))
    @inbounds for i in axes(m, 1)
        mval = -Inf
        for j in axes(m, 2)
            mval = mval < m[i, j] ? m[i, j] : mval
        end
        maxi[i] = mval
    end
    # Column-outer, row-inner order for the scaling pass.
    @inbounds for j in axes(m, 2)
        for i in axes(m, 1)
            m[i, j] /= maxi[i]
        end
    end
    return nothing
end
# Time baa2! with BenchmarkTools; `$m` interpolates the global into the benchmark.
@btime baa2!($m) # 123.239 μs (1 allocation: 7.94 KiB)
I got one more: although it’s getting a bit ridiculous syntax wise
"""
    fasterbaz3!(m)

Divide every element of the matrix `m`, in place, by the maximum of its row.

Row maxima are accumulated in a copy of the first column while scanning the
remaining columns; a second pass divides each element by its row's maximum.
A matrix with zero columns is left untouched. Returns `nothing`.

The loops run over `1:nrows` / `1:ncols` under `@inbounds`, so `m` is
expected to use ordinary 1-based indexing.
"""
function fasterbaz3!(m)
    nrows, ncols = size(m)
    # Nothing to normalise; also avoids a BoundsError from `m[:, 1]` below.
    ncols == 0 && return nothing
    # Seeding with a copy of column 1 lets the scan start at column 2.
    maximums = m[:, 1]
    # Column-outer, row-inner order matches Julia's column-major memory layout.
    @inbounds for j in 2:ncols, i in 1:nrows
        maximums[i] = max(maximums[i], m[i, j])
    end
    @inbounds for j in 1:ncols, i in 1:nrows
        m[i, j] /= maximums[i]
    end
    return nothing
end
ok ok very last one! let’s use the fact that multiplications are faster than divisions…
"""
    fasterbaz4!(m)

Scale every row of the matrix `m`, in place, by the reciprocal of that row's
maximum.

Same two-pass scheme as dividing by the row maximum, except that the maxima
are inverted once and the second pass multiplies instead of divides. Because
`x * (1 / max)` is rounded twice, results can differ in the last bit from
`x / max`. A matrix with zero columns is left untouched. Returns `nothing`.

The loops run over `1:nrows` / `1:ncols` under `@inbounds`, so `m` is
expected to use ordinary 1-based indexing.
"""
function fasterbaz4!(m)
    nrows, ncols = size(m)
    # Nothing to normalise; also avoids a BoundsError from `m[:, 1]` below.
    ncols == 0 && return nothing
    # Seeding with a copy of column 1 lets the scan start at column 2.
    maximums = m[:, 1]
    @inbounds for j in 2:ncols, i in 1:nrows
        maximums[i] = max(maximums[i], m[i, j])
    end
    # From here on `maximums` holds the reciprocals used for the multiplication.
    maximums .= 1 ./ maximums
    @inbounds for j in 1:ncols, i in 1:nrows
        m[i, j] *= maximums[i]
    end
    return nothing
end
"""
    baa3!(m)

Scale every row of the matrix `m`, in place, by the reciprocal of that row's
maximum.

Row maxima are accumulated in a copy of the first column, inverted once, and
then multiplied into `m` column by column. Multiplying by the reciprocal can
differ in the last bit from dividing by the maximum. A matrix with zero
columns is left untouched. Returns `nothing`.

The `@inbounds` loops index the copied column with `axes(m, 1)`, so `m` is
expected to use ordinary 1-based indexing.
"""
function baa3!(m)
    ncol = size(m, 2)
    # Nothing to normalise; also avoids a BoundsError from `m[:, 1]` below.
    ncol == 0 && return nothing
    # Seeding with a copy of column 1 lets the scan start at column 2.
    maxi = m[:, 1]
    @inbounds for j in 2:ncol
        for i in axes(m, 1)
            maxi[i] = maxi[i] < m[i, j] ? m[i, j] : maxi[i]
        end
    end
    # From here on `maxi` holds the reciprocals of the row maxima.
    maxi .= 1 ./ maxi
    @inbounds for j in axes(m, 2)
        for i in axes(m, 1)
            m[i, j] *= maxi[i]
        end
    end
    return nothing
end
# Time baa3! with BenchmarkTools; `$m` interpolates the global into the benchmark.
@btime baa3!($m) # 34.706 μs (1 allocation: 7.94 KiB)
Something is going wrong with that maximum broadcast.
julia> foo2(df) = df ./ [maximum(row) for row in eachrow(df)]
foo2 (generic function with 1 method)
julia> @btime foo2($df)
29.195 ms (554947 allocations: 10.81 MiB)
That being said, it’s a lot better with Tables.rows instead of eachrow.
julia> foo2(df) = df ./ [maximum(row) for row in Tables.rows(df)]
foo2 (generic function with 1 method)
julia> @btime foo2($df)
12.237 ms (9063 allocations: 985.81 KiB)