CuArray + GLMakie

Turns out there was some “scalar indexing” going on, where individual elements of the GPU buffers were being copied to the CPU for processing. By using GLBuffer{Point2f} directly, and by using low-level APIs that avoid some of the automatic processing (e.g. center! inspects the data to determine limits, and lines! does a map to detect invalid vertices), I now have a working example that keeps all data on the GPU:

"""
    plot(; T=Float32, N=1024, resolution=(800,600))

Generate `N` random 2D points with CUDA and render them as a scatter plot with
GLMakie, writing the points straight into an OpenGL buffer through the CUDA
graphics-interop API. The rendered scene is saved to `plot.png`.

# Keywords
- `T`: element type of the random x/y coordinates generated with `CUDA.rand`.
- `N`: number of points (and length of the `GLBuffer{Point2f}`).
- `resolution`: passed to both the GL screen and the `Scene`.

# Returns
`nothing`.

The CUDA graphics resource is unmapped and unregistered in `finally` blocks, so it is
released even when generating the points, rendering or saving throws.
"""
function plot(; T=Float32, N=1024, resolution=(800,600))
    t = CUDA.rand(T, N)
    X = CUDA.rand(T, N)


    ## initialization
    #
    # this should be only done once, keeping the buffer and resources across frames

    # XXX: we need to create a screen, which initializes a GL Context,
    #      so that we can create a GLBuffer before having rendered anything.
    screen = GLMakie.global_gl_screen(resolution, true)

    # get a buffer object and register it with CUDA
    buffer = GLAbstraction.GLBuffer(Point2f, N)
    resource = let
        ref = Ref{CUDA.CUgraphicsResource}()
        CUDA.cuGraphicsGLRegisterBuffer(ref, buffer.id,
                                        CUDA.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD)
        ref[]
    end

    # from here on the buffer is registered with CUDA; make sure it gets unregistered
    # again no matter how we leave the function
    try
        # NOTE: Makie's out-of-place API (lines, scatter) performs many iterating
        #       operations, like determining the range of the data, so we use a manual
        #       scene instead.
        scene = Scene(; resolution)
        cam2d!(scene)

        # XXX: manually position the camera (`center!` would iterate data)
        cam = Makie.camera(scene)
        cam.projection[] = Makie.orthographicprojection(
            #= x =# 0f0, 1f0,
            #= y =# 0f0, 1f0,
            #= z =# 0f0, 1f0)


        ## main processing
        #
        # this needs to be done every time we get new data and want to plot it

        NVTX.@range "main" begin
            # process data, generate points
            NVTX.@range "CUDA" begin
                # map OpenGL buffer object for writing from CUDA
                CUDA.cuGraphicsMapResources(1, [resource], stream())

                # the resource is mapped now; always unmap it, even if the code below
                # throws, so that the buffer is not left mapped
                try
                    # get a CuArray object that we can work with; it wraps the mapped
                    # pointer, so it must not be used after the resource is unmapped
                    array = let
                        ptr_ref = Ref{CUDA.CUdeviceptr}()
                        numbytes_ref = Ref{Csize_t}()
                        CUDA.cuGraphicsResourceGetMappedPointer_v2(ptr_ref, numbytes_ref,
                                                                   resource)

                        ptr = reinterpret(CuPtr{Point2f}, ptr_ref[])
                        len = Int(numbytes_ref[] ÷ sizeof(Point2f))

                        unsafe_wrap(CuArray, ptr, len)
                    end

                    # generate points
                    broadcast!(array, t, X) do x, y
                        Point2f(x, y)
                    end

                    # wait for the GPU to finish
                    synchronize()
                finally
                    CUDA.cuGraphicsUnmapResources(1, [resource], stream())
                end
            end

            # generate and render plot
            NVTX.@range "Makie" begin
                scatter!(scene, buffer)

                # force everything to render (for benchmarking purposes)
                GLMakie.render_frame(screen, resize_buffers=false)
                GLMakie.glFinish()
            end

        end

        save("plot.png", scene)
    finally
        ## clean-up

        CUDA.cuGraphicsUnregisterResource(resource)
    end

    return
end

This performs well, doing all the rendering in a couple of hundred microseconds (µs). It requires https://github.com/JuliaPlots/Makie.jl/pull/1803, and I’d also recommend disabling gpu_getindex so that scalar iteration of GLBuffer (a performance trap) is disallowed:

@eval GLMakie.GLAbstraction begin
    # XXX: to make scalar iteration error
    #
    # Development-only override of the range-based `gpu_getindex` method for
    # `GLBuffer`: it always throws, so any code path that reads buffer contents back
    # to the CPU fails loudly instead of silently being slow.
    # NOTE(review): presumably the upstream method being replaced copies the requested
    #               range into a `Vector{T}` with `glGetBufferSubData` — confirm against
    #               GLMakie before relying on this.
    # The signature is kept as `GLBuffer{T}`/`UnitRange` so that it overwrites that
    # method rather than adding a new one.
    function gpu_getindex(b::GLBuffer{T}, range::UnitRange) where T
        error("GLBuffer getindex")  # XXX: for development
    end
end

It’s too bad that GLBuffer doesn’t support many array operations, which would be needed to make, e.g., lines! work properly. I wonder if it wouldn’t be better to pass CuArrays into Makie and have it do the necessary GL interop calls automatically (which would make it possible to keep the CuArray around longer and perform array operations on it, instead of eagerly converting it to a GLBuffer).

5 Likes