Below is an example of how I would introduce a filtering step.
Adjust the filtering step as required – your overlap criteria weren’t clear to me.
using GenomicFeatures #v2
using DataFrames
col = [
Interval("chr1", 10628, 10683, '?', "abc")
Interval("chr1", 10643, 10779, '?', "abc")
Interval("chr1", 10645, 10748, '?', "abc")
Interval("chr1", 10648, 10786, '?', "abc")
Interval("chr1", 10676, 10767, '?', "abc")
Interval("chr1", 10690, 10731, '?', "abc")
Interval("chr1", 10707, 10793, '?', "abc")
Interval("chr1", 10742, 10814, '?', "abc")
] |> IntervalCollection
hhh = [
Interval("chr1", 10631, 10638)
Interval("chr1", 10633, 10635)
Interval("chr1", 10636, 10650)
Interval("chr1", 10638, 10649)
Interval("chr1", 10641, 10651)
Interval("chr1", 10644, 10689)
Interval("chr1", 10645, 10660)
Interval("chr1", 10650, 10650)
] |> IntervalCollection
function number_interval(tp::Tuple)
# Unpack Tuple.
(i, interval) = tp
# Setup numbered metadata.
new_metadata = (
i = i,
original = GenomicFeatures.metadata(interval)
)
# Create new interval with numbered metadata.
return Interval(
seqname(interval),
leftposition(interval),
rightposition(interval),
strand(interval),
new_metadata
)
end
"""
Given intervals a and b, that are assumed to overlap, the overlap is the span between the maximum left position and minimum right position.
interval: a b a b
side: l l r r
layout: |----|----|---|
"""
function overlap_span(interval_a, interval_b)
left = max(leftposition(interval_a), leftposition(interval_b))
right = min(rightposition(interval_a), rightposition(interval_b))
return length(left:right)
end
# Convert intervals.
col_numbered = col |> enumerate .|> number_interval |> IntervalCollection
hhh_numbered = hhh |> enumerate .|> number_interval |> IntervalCollection
# Setup overlap iterator.
itr = eachoverlap(col_numbered, hhh_numbered); # Note: suppressing IntervalTree's output.
# Setup "on the fly" filtering.
itr = Iterators.filter(itr) do (c, h)
# Select pairs with a minimum overlap of 5.
if overlap_span(c, h) >= 5
return true
end
# Reject overlapping pair.
return false
end
# Setup result generator.
itr = Base.Generator(itr) do (c, h)
result = (
queryHits = GenomicFeatures.metadata(c).i,
subjectHits = GenomicFeatures.metadata(h).i
)
return result
end
# Collect results in a `DataFrame`.
df = DataFrame(itr)
julia> df
19×2 DataFrame
Row │ queryHits subjectHits
│ Int64 Int64
─────┼────────────────────────
1 │ 1 1
2 │ 1 3
3 │ 1 4
4 │ 1 5
5 │ 1 6
6 │ 1 7
7 │ 2 3
8 │ 2 4
9 │ 2 5
10 │ 2 6
11 │ 2 7
12 │ 3 3
13 │ 3 4
14 │ 3 5
15 │ 3 6
16 │ 3 7
17 │ 4 6
18 │ 4 7
19 │ 5 6
This question of zhangchunyong’s continues from the following.