"""
    conver(ind, str)

Classify the two-character window of `str` at positions `ind + 1` and `ind + 2`.

`ind` is a zero-based offset into `str`.

# Returns
- `nothing` if the window does not lie completely inside `str` (offset negative or
  too close to the end).
- `missing` if either character of the window is `'N'`.
- `true` if the window is exactly `"CG"`, `false` otherwise.
"""
function conver(ind, str)
    # Guard both ends: a negative offset would otherwise make `SubString` throw a
    # `BoundsError` instead of reporting "not covered" like the upper-bound case does.
    (ind < 0 || length(str) < ind + 2) && return nothing
    dinucleotide = SubString(str, ind + 1, ind + 2)
    'N' in dinucleotide && return missing
    return dinucleotide == "CG"
end
"""
    classify_reads(index, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy)

Decide whether the CpG calls of read `index` are mixed.

Column 2 of group `index` of `match_read_cpg` is read as the CpG indices covered by
the read. For each of them the offset `starts_cpgs[cpg] - starts_reads[index]` is
passed to `conver` together with `String(seqs_reads[index])`.

CpGs for which `conver` returns `missing` or `nothing` are rejected: every row of
`overlapcopy` whose `queryHits` equals `index` and whose `subjectHits` is a rejected
CpG is deleted **in place** (note that this function mutates `overlapcopy` even
though its name carries no `!`).

# Arguments
- `index`: group number in `match_read_cpg`, also used to index `starts_reads` and
  `seqs_reads`.
- `match_read_cpg::GroupedDataFrame`: one group per read.
- `starts_cpgs`, `starts_reads`, `seqs_reads`: indexable collections. They are
  deliberately not annotated with scalar types because the body indexes into them.
- `overlapcopy::DataFrame`: table with `queryHits` and `subjectHits` columns.

# Returns
- `missing` if the read covers fewer than 4 CpGs, or fewer than 4 remain after
  rejection.
- `false` if the remaining calls are all `true` or all `false`.
- `true` if they are mixed.
"""
function classify_reads(
    index::Integer,
    match_read_cpg::GroupedDataFrame,
    starts_cpgs,
    starts_reads,
    seqs_reads,
    overlapcopy::DataFrame,
)
    covered_cpgs = match_read_cpg[index][:, 2]
    length(covered_cpgs) < 4 && return missing

    read_start = starts_reads[index]
    read_string = String(seqs_reads[index])

    representation = Bool[]
    rejected = eltype(covered_cpgs)[]
    # Iterate the CpG indices themselves so a rejected CpG is known directly,
    # without searching the offsets for it afterwards.
    for cpg in covered_cpgs
        state = conver(starts_cpgs[cpg] - read_start, read_string)
        if ismissing(state) || isnothing(state)
            push!(rejected, cpg)
        else
            push!(representation, state)
        end
    end

    # One scan and one deletion for the whole read; reads without rejected CpGs
    # never touch `overlapcopy`. This runs before the second length check because
    # the rows must be removed even when the read ends up returning `missing`.
    if !isempty(rejected)
        doomed = findall(
            (overlapcopy.queryHits .== index) .&&
            in.(overlapcopy.subjectHits, Ref(rejected)),
        )
        deleteat!(overlapcopy, doomed)
    end

    length(representation) < 4 && return missing
    concordant = all(representation) || !any(representation)
    return !concordant
end
"""
    calculatestate(classified_reads, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy)

Call `classify_reads` once for every element of `classified_reads`, in iteration
order, forwarding the remaining arguments unchanged.

Returns a `Vector{Union{Missing,Bool}}` holding one result per element.
"""
function calculatestate(classified_reads,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
    classify(read) = classify_reads(
        read, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy
    )
    return Union{Missing,Bool}[classify(read) for read in classified_reads]
end
`match_read_cpg` (the second parameter of the second and third functions) is a `GroupedDataFrame` that I need to process one sub-dataframe at a time. `calculatestate` contains the outer `for` loop; on each iteration it calls `a = classify_reads(i, match_read_cpg, starts_cpgs, starts_reads, seqs_reads, overlapcopy)`, which has its own `for` loop inside. I would actually like to broadcast `conver` (the first function) inside the second function, `classify_reads`, but after each call I have to inspect the result to decide whether to delete rows from `overlapcopy`. So I end up with two nested `for` loops, and this step is slow. Could you please show me how to make it faster? Is there anything else I can change to improve performance? Thanks for helping me!