I’m trying to code a simple b-spline (and then NURBS) on the GPU using kernels to speed up a ton of simultaneous evaluations.
I’m having trouble because Cox-deBoor is recursive and throws an error when I try to use this as a kernel. Alternatively, I could use de Boor’s method but this requires allocating an array, which is also not allowed in a kernel. In the code below, I avoid the problem by “manually” unrolling the recursion in Cox-de Boor, but I’m sure there is a better way.
There is also the issue that I need the spline data on the GPU, and I think I have correctly handled this using a wrapper type, but perhaps this isn’t optimal either.
#wrapper type
struct BSplineCurve{A<:AbstractArray,V<:AbstractVector,T<:Int} <: Function
pnts::A
knots::V
degree::T
end
using StaticArrays
function BSplineCurve(pnts;degree=3)
count,T = size(pnts, 2),promote_type(eltype(pnts),Float32)
knots = [zeros(degree); collect(range(0, count-degree) / (count-degree)); ones(degree)]
BSplineCurve(pnts,SA{T}[knots...],degree)
end
function (l::BSplineCurve)(u::T) where T
n = size(l.pnts, 2)
pt = SA{T}[0,0]
for k in 1:n
pt += Bd(l.knots,u,k,l.degree)*l.pnts[:,k]
end
pt
end
# terrible unrolled recursion
function Bd(knots, u, k, d)
d==1 && return Bd1(knots,u,k)
d==2 && return Bd2(knots,u,k)
return Bd3(knots,u,k)
end
function Bd3(knots, u, k)
d=3
α = ifelse(knots[k]<u≤knots[k+d],(u-knots[k])/(knots[k+d]-knots[k])*Bd2(knots,u,k),0f0)
β = ifelse(knots[k+1]≤u<knots[k+d+1],(knots[k+d+1]-u)/(knots[k+d+1]-knots[k+1])*Bd2(knots,u,k+1),0f0)
α+β
end
function Bd2(knots, u, k)
d=2
α = ifelse(knots[k]<u≤knots[k+d],(u-knots[k])/(knots[k+d]-knots[k])*Bd1(knots,u,k),0f0)
β = ifelse(knots[k+1]≤u<knots[k+d+1],(knots[k+d+1]-u)/(knots[k+d+1]-knots[k+1])*Bd1(knots,u,k+1),0f0)
α+β
end
function Bd1(knots, u, k)
d=1
α = ifelse(knots[k]<u≤knots[k+d],(u-knots[k])/(knots[k+d]-knots[k]),0f0)
β = ifelse(knots[k+1]≤u<knots[k+d+1],(knots[k+d+1]-u)/(knots[k+d+1]-knots[k+1]),0f0)
α+β
end
# Does it work with KernelAbstractions?
using CUDA, KernelAbstractions
@kernel function _test!(a::AbstractArray,l::BSplineCurve)
# Map index to physical space
I = @index(Global)
u = (I-1)/(length(a)-1)
q = l(u)
a[I] = q'*q # just want some scalar output
end
test!(a,l)=_test!(get_backend(a),64)(a,l,ndrange=length(a))
a = CUDA.zeros(64)
square = BSplineCurve(SA[5 5 0 -5 -5 -5 0 5 5
0 5 5 5 0 -5 -5 -5 0],degree=1)
test!(a,square)
a|>Array # yes!