using GenomicFeatures
using DataFrames
col = [
Interval("chr1", 10628, 10683, '?', "abc")
Interval("chr1", 10643, 10779, '?', "abc")
Interval("chr1", 10645, 10748, '?', "abc")
Interval("chr1", 10648, 10786, '?', "abc")
Interval("chr1", 10676, 10767, '?', "abc")
Interval("chr1", 10690, 10731, '?', "abc")
Interval("chr1", 10707, 10793, '?', "abc")
Interval("chr1", 10742, 10814, '?', "abc")
] |> IntervalCollection
hhh = [
Interval("chr1", 10631, 10638)
Interval("chr1", 10633, 10635)
Interval("chr1", 10636, 10650)
Interval("chr1", 10638, 10649)
Interval("chr1", 10641, 10651)
Interval("chr1", 10644, 10689)
Interval("chr1", 10645, 10660)
Interval("chr1", 10650, 10650)
] |> IntervalCollection
But I want to findoverlap with minoverlap=5.I tried like the following
function number_interval(tp::Tuple)
# Unpack Tuple.
(i, interval) = tp
# Setup numbered metadata.
new_metadata = (
i = i,
original = GenomicFeatures.metadata(interval)
)
# Create new interval with numbered metadata.
return Interval(
seqname(interval),
leftposition(interval),
rightposition(interval),
strand(interval),
new_metadata
)
end
# Convert intervals.
col_numbered = col |> enumerate .|> number_interval |> IntervalCollection
hhh_numbered = hhh |> enumerate .|> number_interval |> IntervalCollection
# Reproduce example.
df = DataFrame()
for (c, h) in eachoverlap(col_numbered, hhh_numbered)
result = (
queryHits = GenomicFeatures.metadata(c).i,
subjectHits = GenomicFeatures.metadata(h).i
)
push!(df, result)
end
julia> df
27×2 DataFrame
Row │ queryHits subjectHits
│ Int64 Int64
─────┼────────────────────────
1 │ 1 1
2 │ 1 2
3 │ 1 3
4 │ 1 4
5 │ 1 5
6 │ 1 6
7 │ 1 7
8 │ 1 8
9 │ 2 3
10 │ 2 4
11 │ 2 5
12 │ 2 6
13 │ 2 7
14 │ 2 8
15 │ 3 3
But I can not solve the problem with minoverlap=5.For example,hhh’s first line has more than 5 overlaps with col’s first ,I will output the index of col’s index and hhh’s index.hhh’s second line does not have 5 overlaps,it will not occur in the final dataframe.The function above seems to solve minoverlap=1.What should I do to solve this problem? Thank all guys for helping me!