Another question on eachoverlap in GenomicFeatures.jl

Below is an example of how I would introduce a filtering step.
Adjust the filtering step as required – your overlap criteria weren’t clear to me.

using GenomicFeatures #v2
using DataFrames


col = [
	Interval("chr1", 10628, 10683, '?', "abc")
	Interval("chr1", 10643, 10779, '?', "abc")
	Interval("chr1", 10645, 10748, '?', "abc")
	Interval("chr1", 10648, 10786, '?', "abc")
	Interval("chr1", 10676, 10767, '?', "abc")
	Interval("chr1", 10690, 10731, '?', "abc")
	Interval("chr1", 10707, 10793, '?', "abc")
	Interval("chr1", 10742, 10814, '?', "abc")
] |> IntervalCollection

hhh = [
	Interval("chr1", 10631, 10638)
	Interval("chr1", 10633, 10635)
	Interval("chr1", 10636, 10650)
	Interval("chr1", 10638, 10649)
	Interval("chr1", 10641, 10651)
	Interval("chr1", 10644, 10689)
	Interval("chr1", 10645, 10660)
	Interval("chr1", 10650, 10650)
] |> IntervalCollection

function number_interval(tp::Tuple)
	# Unpack Tuple.
	(i, interval) = tp

	# Setup numbered metadata.
	new_metadata = (
		i = i,
		original = GenomicFeatures.metadata(interval)
	)

	# Create new interval with numbered metadata.
	return Interval(
		seqname(interval),
		leftposition(interval),
		rightposition(interval),
		strand(interval),
		new_metadata
	)
end

"""
Given intervals a and b, that are assumed to overlap, the overlap is the span between the maximum left position and minimum right position.
interval: a    b    a   b
    side: l    l    r   r
  layout: |----|----|---|
"""
function overlap_span(interval_a, interval_b)
	left = max(leftposition(interval_a), leftposition(interval_b))
	right = min(rightposition(interval_a), rightposition(interval_b))
	return length(left:right)
end

# Convert intervals.
col_numbered = col |> enumerate .|> number_interval |> IntervalCollection
hhh_numbered = hhh |> enumerate .|> number_interval |> IntervalCollection

# Setup overlap iterator.
itr = eachoverlap(col_numbered, hhh_numbered); # Note: suppressing IntervalTree's output.

# Setup "on the fly" filtering.
itr = Iterators.filter(itr) do (c, h)
	# Select pairs with a minimum overlap of 5.
	if overlap_span(c, h) >= 5
		return true
	end
	
	# Reject overlapping pair.
	return false
end

# Setup result generator.
itr = Base.Generator(itr) do (c, h)
	result = (
		queryHits = GenomicFeatures.metadata(c).i,
		subjectHits = GenomicFeatures.metadata(h).i
	)
	return result
end

# Collect results in a `DataFrame`.
df = DataFrame(itr)
julia> df
19×2 DataFrame
 Row │ queryHits  subjectHits 
     │ Int64      Int64       
─────┼────────────────────────
   1 │         1            1
   2 │         1            3
   3 │         1            4
   4 │         1            5
   5 │         1            6
   6 │         1            7
   7 │         2            3
   8 │         2            4
   9 │         2            5
  10 │         2            6
  11 │         2            7
  12 │         3            3
  13 │         3            4
  14 │         3            5
  15 │         3            6
  16 │         3            7
  17 │         4            6
  18 │         4            7
  19 │         5            6

This question of zhangchunyong’s continues from the following.

1 Like