How do I make this Julia code efficient?

using BioSequences, DataFrames

# conver checks whether the two characters at positions ind+1 and ind+2 of str are "CG";
# it returns missing if they contain an 'N' and nothing if the string is too short.
function conver(ind, str)
    ls = length(str)
    ls < ind + 2 && return nothing
    s = SubString(str, ind+1, ind+2)
    if 'N' in s
        return missing
    else
        return s=="CG"
    end
end
# classify_reads returns true if the CpG calls in one read are discordant, false if they
# are all concordant, and missing if the read covers fewer than 4 usable CpGs.
function classify_reads(index::Int64
                       ,match_read_cpg::GroupedDataFrame
                       ,starts_cpgs::Vector{Int64}
                       ,starts_reads::Vector{Int64}
                       ,seqs_reads::Vector{LongDNASeq}
                       ,overlapcopy::DataFrame)
    covered_cpgs = match_read_cpg[index][:,2]
    if length(covered_cpgs)<4
        return missing
    end
    start_cpgs=starts_cpgs[covered_cpgs]
    start_of_read=starts_reads[index]
    start_cpgs = start_cpgs .- start_of_read
    sequence=seqs_reads[index]
    representation=Vector{Bool}()
    s=String(sequence)
    for i in start_cpgs
        c=conver(i,s)
        if isequal(c, missing) || isequal(c, nothing)
            deleteat!(overlapcopy, findall(overlapcopy.queryHits .== index .&& overlapcopy.subjectHits .== covered_cpgs[findfirst(x -> x == i, start_cpgs)]))
        else
            push!(representation,c)
        end
    end
    if length(representation)<4
        return missing
    end
    concordant = (all(representation) || all(.!representation))
    return !concordant
end

function calculatestate(classified_reads,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
    p=Vector{Union{Missing,Bool}}()
    for i in classified_reads
        #println(i)
        a=classify_reads(i,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
        push!(p,a)
    end
    p
end

match_read_cpg (the second parameter of the second and third functions) is a GroupedDataFrame that I need to iterate one DataFrame at a time. When I execute calculatestate, the outer for loop calls a = classify_reads(i, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy), which contains a for loop of its own. I actually want to broadcast conver (the first function) inside classify_reads, but after each call I have to check the result and possibly delete rows of overlapcopy, so I end up with two nested for loops and this step is slow. Would you please show me how to make it faster? Is there anything else I can change to improve the performance? Thanks for helping me!
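
What I mean by broadcasting conver is roughly this (just a sketch that ignores the deletion of overlapcopy rows):

results = conver.(start_cpgs, Ref(s))                 # broadcast conver over all CpG starts in the read
representation = [r for r in results if r isa Bool]   # drop the missing/nothing results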

The low-hanging fruits are (I have no idea how much they affect performance, because the example is not runnable):

  1. Add a view here (see the sketch after this list)
  2. Replace this with a non-allocating generator of the indexes to run over (see the sketch after this list):
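
For example (I am guessing which lines these refer to: I assume the view goes on the covered_cpgs = match_read_cpg[index][:,2] column extraction, and the generator replaces the allocating starts_cpgs[covered_cpgs] and start_cpgs .- start_of_read steps):

# 1. a view avoids copying the group column
covered_cpgs = @view match_read_cpg[index][:, 2]

# 2. a generator avoids allocating the intermediate start_cpgs vector
start_of_read = starts_reads[index]
start_cpgs = (c - start_of_read for c in view(starts_cpgs, covered_cpgs))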

This is probably the worst line: the deleteat!(overlapcopy, findall(...)) call with the nested findfirst.

Why not something like this (I even think it is simpler, not sure if it is correct):

for i in eachindex(overlapcopy.queryHits)
    if overlapcopy.queryHits[i] == index &&
       overlapcopy.subjectHits[i] == covered_cpgs[findfirst(x -> x == i, start_cpgs)]
        deleteat!(overlapcopy, i)
    end
end

Finally, in concordant = (all(representation) || all(.!representation)) I think you are allocating an array unnecessarily, because .!representation will create a new array:

use

concordant = (all(representation) || all(==(false), representation))

(or all(!, representation), which also seems to work)
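
If you want to check the allocation claim yourself, a quick comparison with BenchmarkTools (the vector here is just made-up data) would look like:

using BenchmarkTools

representation = rand(Bool, 1_000)

@btime all(.!($representation))           # materializes a temporary array first
@btime all(!, $representation)            # no temporary array
@btime all(==(false), $representation)    # no temporary array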

Thanks very much. I will test it.

A really basic thing to remember is that DataFrames and GroupedDataFrames are not type-stable objects. So you should write a function that acts on the columns of those objects and pass the columns themselves to that function.
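
A minimal sketch of that pattern (count_hits is a made-up kernel, and queryHits is just the column name from your example):

using DataFrames

# kernel that works on a plain vector with a concrete element type, not on the DataFrame
function count_hits(queryHits::AbstractVector{<:Integer}, index::Integer)
    n = 0
    for q in queryHits
        q == index && (n += 1)
    end
    return n
end

# at the call site, extract the column and pass it through the function barrier,
# so Julia can specialize count_hits on the concrete vector type
df = DataFrame(queryHits = [1, 2, 1, 3])
count_hits(df.queryHits, 1)   # returns 2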
