As @bkamins pointed out above, type instabilities are causing all the problems. Here is a version along the lines he recommended, but using DataFramesMeta.jl for easier syntax.
julia> using DataFramesMeta, BenchmarkTools;
julia> function setup()
           # Build the benchmark table: two columns of random integer ids.
           ids = 1:20_000      # range every id is drawn from
           nrows = 1_000_000   # number of rows in each column
           return DataFrame(
               protein1 = [rand(ids) for _ in 1:nrows],
               protein2 = [rand(ids) for _ in 1:nrows],
           )
       end;
julia> function duplicates(df)
           # For each of the first 200 rows `row`, find the first row `other >= row`
           # whose (protein1, protein2) pair is the swap of row's pair, and record
           # its index. Entries with no such partner stay 0.
           #
           # The columns are deliberately looked up through `df` inside the loop:
           # this is the baseline being compared against `duplicates_dfm`.
           limit = 200
           partner = zeros(Int, limit)
           nrows = size(df, 1)
           for row in 1:limit
               for other in row:nrows
                   df.protein1[row] == df.protein2[other] || continue
                   df.protein1[other] == df.protein2[row] || continue
                   partner[row] = other
                   break  # only the first match is kept
               end
           end
           return partner
       end;
julia> function duplicates_dfm(df)
           # Same search as `duplicates`, but the loop runs inside `@with df`,
           # where `:protein1` / `:protein2` refer to the columns of `df`.
           # For each of the first 200 rows, store the index of the first row at
           # or after it holding the swapped pair; 0 when there is none.
           limit = 200
           partner = zeros(Int, limit)
           @with df begin
               nrows = length(:protein1)
               for row in 1:limit
                   for other in row:nrows
                       :protein1[row] == :protein2[other] || continue
                       :protein1[other] == :protein2[row] || continue
                       partner[row] = other
                       break  # only the first match is kept
                   end
               end
           end
           return partner
       end;
julia> df = @btime setup();  # time building the table; the result is reused by the runs below
15.265 ms (33 allocations: 30.52 MiB)
julia> dups = @btime duplicates($df);  # baseline: loop indexes df.protein1 / df.protein2 directly
31.710 s (990677773 allocations: 20.72 GiB)
julia> dups = @btime duplicates_dfm($df);  # same loop, run inside `@with df`
193.206 ms (7 allocations: 2.03 KiB)