How do I make this Julia code efficient?

using BioSequences, DataFrames

# conver checks whether the two characters at positions ind+1 and ind+2 of str are "CG";
# it returns missing if they contain an 'N' and nothing if the string is too short.
function conver(ind, str)
    ls = length(str)
    ls < ind + 2 && return nothing
    s = SubString(str, ind+1, ind+2)
    if 'N' in s
        return missing
    else
        return s=="CG"
    end
end
# classify_reads returns true if the CpG calls in one read are discordant, false if they
# are all concordant, and missing if the read covers fewer than 4 usable CpGs.
function classify_reads(index::Int64
                       ,match_read_cpg::GroupedDataFrame
                       ,starts_cpgs::Vector{Int64}
                       ,starts_reads::Vector{Int64}
                       ,seqs_reads::Vector{LongDNASeq}
                       ,overlapcopy::DataFrame)
    covered_cpgs = match_read_cpg[index][:,2]
    if length(covered_cpgs)<4
        return missing
    end
    start_cpgs=starts_cpgs[covered_cpgs]
    start_of_read=starts_reads[index]
    start_cpgs = start_cpgs .- start_of_read
    sequence=seqs_reads[index]
    representation=Vector{Bool}()
    s=String(sequence)
    for i in start_cpgs
        c=conver(i,s)
        if isequal(c, missing) || isequal(c, nothing)
            deleteat!(overlapcopy, findall(overlapcopy.queryHits .== index .&& overlapcopy.subjectHits .== covered_cpgs[findfirst(x -> x == i, start_cpgs)]))
        else
            push!(representation,c)
        end
    end
    if length(representation)<4
        return missing
    end
    concordant = (all(representation) || all(.!representation))
    return !concordant
end

function calculatestate(classified_reads,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
    p=Vector{Union{Missing,Bool}}()
    for i in classified_reads
        #println(i)
        a=classify_reads(i,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
        push!(p,a)
    end
    p
end

match_read_cpg (the second parameter of the second and third functions) is a GroupedDataFrame that I need to iterate one DataFrame at a time. When I execute calculatestate, the outer for loop calls a = classify_reads(i, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy), which contains a for loop of its own. I actually want to broadcast conver (the first function) inside classify_reads, but after each call I have to check the result and possibly delete rows of overlapcopy, so I end up with two nested for loops and this step is slow. Would you please show me how to make it faster? Is there anything else I can change to improve the performance? Thanks for helping me!
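
What I mean by broadcasting conver is roughly this (just a sketch that ignores the deletion of overlapcopy rows):

results = conver.(start_cpgs, Ref(s))                 # broadcast conver over all CpG starts in the read
representation = [r for r in results if r isa Bool]   # drop the missing/nothing results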

The low-hanging fruits are (I have no idea how much they affect performance, because the example is not runnable):

  1. Add a view here (see the sketch after this list)
  2. Replace this with a non-allocating generator of the indexes to run over (see the sketch after this list):
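
For example (I am guessing which lines these refer to: I assume the view goes on the covered_cpgs = match_read_cpg[index][:,2] column extraction, and the generator replaces the allocating starts_cpgs[covered_cpgs] and start_cpgs .- start_of_read steps):

# 1. a view avoids copying the group column
covered_cpgs = @view match_read_cpg[index][:, 2]

# 2. a generator avoids allocating the intermediate start_cpgs vector
start_of_read = starts_reads[index]
start_cpgs = (c - start_of_read for c in view(starts_cpgs, covered_cpgs))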

This is probably the worst line: the deleteat!(overlapcopy, findall(...)) call with the nested findfirst.

Why not something like this (I even think it is simpler, not sure if it is correct):

for i in eachindex(overlapcopy.queryHits)
    if overlapcopy.queryHits[i] == index &&
       overlapcopy.subjectHits[i] == covered_cpgs[findfirst(x -> x == i, start_cpgs)]
        deleteat!(overlapcopy, i)
    end
end

Finally, in concordant = (all(representation) || all(.!representation)) I think you are allocating an array unnecessarily, because .!representation will create a new array:

use

concordant = (all(representation) || all(==(false), representation))

(or all(!, representation), which also seems to work)
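
If you want to check the allocation claim yourself, a quick comparison with BenchmarkTools (the vector here is just made-up data) would look like:

using BenchmarkTools

representation = rand(Bool, 1_000)

@btime all(.!($representation))           # materializes a temporary array first
@btime all(!, $representation)            # no temporary array
@btime all(==(false), $representation)    # no temporary array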

Thanks very much. I will test it.

A really basic thing to remember is that DataFrames and GroupedDataFrames are not type-stable objects. So you should write a function that acts on the columns of those objects and pass the columns themselves to that function.
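
A minimal sketch of that pattern (count_hits is a made-up kernel, and queryHits is just the column name from your example):

using DataFrames

# kernel that works on a plain vector with a concrete element type, not on the DataFrame
function count_hits(queryHits::AbstractVector{<:Integer}, index::Integer)
    n = 0
    for q in queryHits
        q == index && (n += 1)
    end
    return n
end

# at the call site, extract the column and pass it through the function barrier,
# so Julia can specialize count_hits on the concrete vector type
df = DataFrame(queryHits = [1, 2, 1, 3])
count_hits(df.queryHits, 1)   # returns 2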
