Hi friends,

I made a post about the problem of generating pairs of objects while also filtering them, and it seemed that parallel computing was the main suggestion, so I’m trying to learn CUDA for that, but I’ve run into a wall with the performance of the underlying functions themselves. They’re run 100k’s times each but they are just multiplication or some conditional statement, so I’m not sure how to make them faster when they are doing things so simple, if there is anything.

```
using Graphs
using Random
using LinearAlgebra
using GeometryBasics
using BenchmarkTools
using StatProfilerHTML
const k = 4
const tol::Float32 = 10.0^(-k)
const rval::Float32 = 10^k
const msgval::Float32 = 0.0452
const msgtuple::GeometryBasics.Point{2,Float32} = GeometryBasics.Point(msgval,msgval)
function computecross(graph::SimpleGraph, pos::Vector{GeometryBasics.Point{2,Float32}}, f=8, b=25)
edgs = collect(edges(graph))
eview = view(edgs, eachindex(edgs))
ecount = ne(graph)
vcount = nv(graph)
allocatedlength = unsafe_trunc(Int32, (ecount*(ecount-1)/f-b*ecount))
segments = map(x -> get_segment(x, pos), edgs)
crossings = Vector{Tuple{Tuple{ Graphs.SimpleGraphs.SimpleEdge{Int64}, Graphs.SimpleGraphs.SimpleEdge{Int64}}, Point{2, Float32} } }(undef, allocatedlength)
sizehint!(crossings, allocatedlength)
index=1
for i in 1:ecount
@inbounds seg_i = segments[i]
for j in 1:i-1
@views @inbounds x1, y1, x2, y2, x3, y3, x4, y4, t, u = line_data(seg_i, segments[j])
if 0 < t && t < 1 && 0 < u && u < 1
point = lineintersection(x1, y1, x1-x2, y1-y2, x3, y3, x3-x4, y3-y4, t, u)
@views @inbounds if point !== msgtuple && not_on_endpoint(seg_i, segments[j], point)
@inbounds crossings[index] = ((eview[i], eview[j]), point)
index+=1
end
end
end
end
resize!(crossings, index-1)
return crossings
end
function get_segment(e, positions::Vector{GeometryBasics.Point{2,Float32}})
return GeometryBasics.Line(positions[src(e)], positions[dst(e)])
end
function line_data(seg1, seg2)
x1 = seg1[1][1]
y1 = seg1[1][2]
x2 = seg1[2][1]
y2 = seg1[2][2]
x3 = seg2[1][1]
y3 = seg2[1][2]
x4 = seg2[2][1]
y4 = seg2[2][2]
@fastmath t_n = (x1-x3)*(y3-y4)-(y1-y3)*(x3-x4)
@fastmath t_d = (x1-x2)*(y3-y4)-(y1-y2)*(x3-x4)
@fastmath u_n = -1*((x1-x2)*(y1-y3)-(y1-y2)*(x1-x3))
@fastmath u_d = (x1-x2)*(y3-y4)-(y1-y2)*(x3-x4)
t = t_n/t_d
u = u_n/u_d
return x1, y1, x2, y2, x3, y3, x4, y4, t, u
end
function lineintersection(x1, y1, x12, y12, x3, y3, x34, y34, t, u)
@fastmath p1x = x1-(t)*(x12)
@fastmath p1y = y1-(t)*(y12)
@fastmath p2x = x3+(u)*(x34)
@fastmath p2y = y3+(u)*(y34)
if isapprox_on_lines(-x12, -y12, x1, y1, p1x, p1y)
return GeometryBasics.Point(fast_k_round(p1x), fast_k_round(p1y))
elseif isapprox_on_lines(-x34, -y34, x3, y3, p2x, p2y)
return GeometryBasics.Point(fast_k_round(p2x), fast_k_round(p2y))
else
return msgtuple
end
end
function not_on_endpoint(seg1, seg2, pt)
return (distance2(seg1[1],pt)<tol || distance2(seg1[2],pt)<tol || distance2(seg2[1],pt)<tol || distance2(seg2[2],pt)<tol) ? (return false) : (return true)
end
function distance2(x, y)
sum = float(abs2(zero(eltype(x)) - zero(eltype(y))))
@simd for i in eachindex(x, y)
@inbounds sum += abs2(x[i] - y[i])
end
return sum
end
function is_intersecting(t, u)
return (0 <= t ) && (t <= 1) && (0 <= u) && (u <= 1)
end
function fast_k_round(x::Float32)
@fastmath @inline return Float32(unsafe_trunc(Int32, rval*x)/rval)
end
function isapprox_on_lines(seg1xdiff::Float32, seg1ydiff::Float32, choosenseg1x::Float32, choosenseg1y::Float32, px::Float32, py::Float32)
# Compute cross products
cross1 = @fastmath seg1xdiff*(py-choosenseg1y)-seg1ydiff*(px-choosenseg1x)
# Compute squares of magnitudes
norm_sq1::Float32 = @fastmath abs2(cross1)
# Compare squares of magnitudes to square of tolerance
norm_sq1 < tol ? (return true) : (return false)
end
function testing_generate_layout(graph::AbstractGraph)
n = nv(graph)
positions::Vector{GeometryBasics.Point{2,Float32}} = [GeometryBasics.Point(round(rand(), digits = k), round(rand(), digits = k)) for _ in 1:n]
return positions
end
function testing_generate_graph()
n = rand(50:50)
max_edges = n * (n - 1) ÷ 2
e = min(max_edges)
graph = SimpleGraph(n, e)
#graph = SimpleDiGraph(n, e)
return graph
end
G = testing_generate_graph()
positions1 = testing_generate_layout(G)
```

Around 50% of the program when sampled are the functions below.

`not_on_endpoint`

is ~18% which checks if a calculated crossing point is an endpoint of one of the line segments. This is mostly from calculating distances, For this, I was using a function for Euclidean distance that had been posted by a steward I think on the forums, so I don’t think I can improve it at all.

`lineintersection`

is ~17%. Most of this is a conditional for checking which of two possible points is an intersection by checking if it is approximately on the lines with a cross-product calculation.

`line_data`

is ~14% and this retrieves coordinates from the line segments and calculates two numbers for an inequality which determines if there is a crossing.

I’ve tried to read every post I can find on best practices and performance for these, and the function is type-stable. Is there something I’m not doing, or doing wrong, that should be obvious?

Benchmark for 50 vertex complete graph with random drawing is ~6.5 to 6.8ms and ~6MiB