How to speed up the for-loop with dataframe access

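As @bkamins mentioned above, type instabilities are causing all the problems: a `DataFrame` does not carry its column types in its own type, so every `df.protein1[i]` inside the hot loop has to be resolved dynamically at run time. Here is a version along the lines recommended, but using DataFramesMeta.jl for easier syntax; `@with` passes the columns to a function, which acts as a function barrier so the loop is compiled for the concrete column types.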

julia> using DataFramesMeta, BenchmarkTools;

julia> function setup()
           # Build the benchmark input: a two-column DataFrame (`protein1`, `protein2`)
           # with `nrows` rows of random integer ids drawn from 1:maxid.
           nrows, maxid = 1_000_000, 20_000
           randomids() = [rand(1:maxid) for _ in 1:nrows]
           return DataFrame(protein1 = randomids(), protein2 = randomids())
       end;

julia> function duplicates(df, n::Integer = 200)
           # For each of the first `n` rows `i`, find the first row `j >= i` holding the
           # reversed pair: protein1[i] == protein2[j] and protein1[j] == protein2[i].
           # Returns a length-`n` Vector{Int}; entry `i` is that `j`, or 0 if none exists.
           # `n` defaults to 200, the value that was previously hard-coded.
           #
           # Deliberately slow baseline: the columns are looked up on `df` inside the hot
           # loop. When `df` is a DataFrame the column types cannot be inferred from the
           # type of `df`, so every access below is dynamically dispatched.
           dup = zeros(Int, n)
           for i in 1:n
               # When i exceeds the row count this range is empty, so nothing is indexed.
               for j in i:size(df, 1)
                   if df.protein1[i] == df.protein2[j] && df.protein1[j] == df.protein2[i]
                       dup[i] = j
                       break  # keep only the first match for row i
                   end
               end
           end
           return dup
       end;
julia> function duplicates_dfm(df, n::Integer = 200)
           # For each of the first `n` rows `i`, find the first row `j >= i` holding the
           # reversed pair: protein1[i] == protein2[j] and protein1[j] == protein2[i].
           # Returns a length-`n` Vector{Int}; entry `i` is that `j`, or 0 if none exists.
           # `n` defaults to 200, the value that was previously hard-coded.
           dup = zeros(Int, n)
           # `@with` substitutes the `:protein1` / `:protein2` symbols with the columns of
           # `df` and runs the block as a function of those columns, so the loops below
           # are compiled for the concrete column types.
           @with df begin
               for i in 1:n
                   # When i exceeds the row count this range is empty, so nothing is indexed.
                   for j in i:length(:protein1)
                       if :protein1[i] == :protein2[j] && :protein1[j] == :protein2[i]
                           dup[i] = j
                           break  # keep only the first match for row i
                       end
                   end
               end
           end
           return dup
       end;

julia> df = @btime setup();  # @btime also returns the value, so `df` is a real frame
  15.265 ms (33 allocations: 30.52 MiB)

julia> dups = @btime duplicates($df);  # `$df` interpolates the global into the benchmark
  31.710 s (990677773 allocations: 20.72 GiB)

julia> dups = @btime duplicates_dfm($df);  # same input, loop run inside `@with`
  193.206 ms (7 allocations: 2.03 KiB)
2 Likes