two observations:
- Whether the search resumes at “+1” or at “+3” probably doesn’t change the timings much, and it certainly doesn’t change the result, because no new ATG start codon can begin before position(A)+3.
- The takewhile function is also available in the IterTools module, so once the use of drop is avoided, the IterTools package alone is enough.
To compare timings and allocations, I ran these tests:
julia> seq = randdnaseq(10^8)
100000000nt DNA Sequence:
AACATGAGCGTGGTTTTATTTGGGAGGACGATATTATTC…TCCCGCGGCAACTCCCGTACAATCAGAGTTGGCGTACCG
julia> using BenchmarkTools
julia> @time collect(locationiterator(seq))
2.170984 seconds (16.12 M allocations: 842.458 MiB, 11.20% gc time, 7.71% compilation time)
1563050-element Vector{UnitRange{Int64}}:
4:264
45:152
julia> sseq=string(seq)
"AACATGAGCGTGGTTTTATTTGGGAGGACGATATTATTCCCGAGATGCGGTGTGAAAAAACAAATATTAAAACATTCAGCTTCTGTTGTACACAAGTCGCATCGCATTGTGGCCCGGAGGGAAAGGAGGTGTGCGTTGTGGATGATCAATAAAATCCGTCATCCACCCCGCCAGTTAAATTATATAGGCTCTCTCAGAGCACCTGACAGAGCACTCGGCGGTTGGATCGGGATGGCAGAT" ⋯ 99999520 bytes ⋯ "GTATCGGATCCAGGTCTAGTAGAGTCGTATGGACCATAGCGGCTCAGGTTAAAACCCAAAGGCTCTGGGGAAACATCGTGGTTTATCGGCAGTCGTCGACCAGGACAGTCAAGTGACTCTAAGTCCTAGTAGCTCAAACCGCGCGGATGCTGTAACGCTTTCTGGTTGGGCCACTGTAGAGACTCCGACACATTCCTCCCTTCCCGCGGCAACTCCCGTACAATCAGAGTTGGCGTACCG"
julia> ptrn=r"ATG(?:[AGTC][AGTC][AGTC])*?(TAG|TAA|TGA)"
r"ATG(?:[AGTC][AGTC][AGTC])*?(TAG|TAA|TGA)"
julia> f(R) = findnext(ptrn,sseq,first(R)+3)  # next match of the global `ptrn` in the global `sseq`, searching from 3 positions past the start of range R
f (generic function with 1 method)
julia> itr = takewhile(!isnothing, iterated(f, findfirst(ptrn,sseq)))
IterTools.TakeWhile{IterTools.Iterated{UnitRange{Int64}, typeof(f)}}(Base.var"#97#98"{typeof(isnothing)}(isnothing), IterTools.Iterated{UnitRange{Int64}, typeof(f)}(f, 4:264))
julia> @btime [R for R in itr]
949.594 ms (9378307 allocations: 324.01 MiB)
1563050-element Vector{UnitRange{Int64}}:
4:264
45:152
142:264
julia> function se(sseq)
# Collect every range returned by repeated calls to `f`, stopping at the first `nothing`.
# NOTE(review): `f` (defined earlier in this session) searches the global `sseq`,
# so the `sseq` argument of this function is never read inside the body.
eos=true  # loop flag; becomes false once `f` returns `nothing`
R=-2:0  # seed range: `f` searches from first(R)+3 == 1, i.e. the first index of the string
v=UnitRange[]  # abstract element type; the result prints as Vector{UnitRange}
while eos
R=f(R)
(eos=!isnothing(R)) && push!(v,R)  # update the flag and store R only when a match was found
end
v
end
se (generic function with 1 method)
julia> @btime se(sseq)
751.423 ms (4689159 allocations: 138.16 MiB)
1563050-element Vector{UnitRange}:
4:264
45:152
142:264
232:264