GPU/CPU Agnostic FFT code

,

I am looking to port my Professor’s 15 year old C++ code to Julia in a GPU/CPU Agnostic way maybe by using AcceleratedKernels.jl. The C++ code uses FFTW library, I know CUDA has CUDAFFTs but I don’t know if AcceleratedKernels.jl or JuliaGPU has such a framwork for FFTs. Is this a good idea? Or is this too ambitious and I should go for CUDAFFTs or something else?

Julia has a FFTW.jl library that dispatch on cpu and gpu and no problem mixing CuArray programing and kernel programing, both will be agnostic, if you need something more than AcceleratedKernels.jl, KernelAbstraction.jl creates agnostic kernels for cpu and gpu. Also, CxxWrap.jl can help you port things that you don’t know how to translate but that won’t be agnostic. For instance in this code

using FFTW, CUDA, BenchmarkTools
julia> function try_FFT_on_cpu()
           values = rand(256, 256, 256)
           value_complex = ComplexF32.(values)
           cvalues = similar((value_complex), ComplexF32)
           copyto!(cvalues, values)
           cy = similar(cvalues)
           cF = plan_fft!(cvalues, flags=FFTW.MEASURE)
           @btime a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cpu (generic function with 1 method)

julia> function try_FFT_on_cuda()
           values = rand(256, 256, 256)
           value_complex = ComplexF32.(values)
           cvalues = similar(cu(value_complex), ComplexF32)
           copyto!(cvalues, values)
           cy = similar(cvalues)
           cF = plan_fft!(cvalues)
           @btime CUDA.@sync a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cuda (generic function with 1 method)

from Unreasonably fast FFT on CUDA - #8 by roflmaostc

2 Likes

It’s not FFTW.jl but instead AbstractFFTs.jl and CUDA.jl which dispatches CuArray to CUDA.

7 Likes

figured out the way to do it, when I used just AbstractFFTs.jl naively as follows for a GPU/CPU agnostic implementation, it did not work:

function poisson_solve!(out_array::AbstractArray{T,2}, 
                                       inp_array::AbstractArray{T,2},
                                       coefficient::AbstractArray{T,2}) where T
            println("Backend agnostic FFT for Poisson solve")
            fourier_array = rfft(inp_array)
            fourier_array .*= coefficient
            out_array .= irfft(fourier_array, size(inp_array, 2))
            return out_array
end

Apparently, you need to specify the backend and without it I was getting StackOverflow errors.

So, I tried the following with multiple dispatch and the Requires.jl library:

function __init__()
    # CUDA implementation - only loaded if CUDA.jl is available
    @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
        function poisson_solve!(out_array::CUDA.CuArray{T,2}, 
                              inp_array::CUDA.CuArray{T,2},
                              coefficient::CUDA.CuArray{T,2}) where T
            println("Using CUDA FFT for Poisson solve")
            fourier_array = rfft(inp_array)
            fourier_array .*= coefficient
            out_array .= irfft(fourier_array, size(inp_array, 2))
            return out_array
        end
    end

    # AMDGPU implementation - only loaded if AMDGPU.jl is available
    @require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
        function poisson_solve!(out_array::AMDGPU.ROCArray{T,2}, 
                                inp_array::AMDGPU.ROCArray{T,2},
                                coefficient::AMDGPU.ROCArray{T,2}) where T
            println("Using ROCm FFT for Poisson solve")
            fourier_array = rfft(inp_array)
            fourier_array .*= coefficient
            out_array .= irfft(fourier_array, size(inp_array, 2))
            return out_array
        end
    end
end

for CUDA and ROCm and the following for CPU, oneAPI and Metal:

function poisson_solve!(out_array::AbstractArray{T,2}, 
                        inp_array::AbstractArray{T,2},
                        coefficient::AbstractArray{T,2}) where T
    # Convert inputs to CPU Arrays if they aren't already
    # This handles oneAPI and Metal arrays that need conversion
    inp_cpu = Array(inp_array)
    coeff_cpu = Array(coefficient)

    # Use FFTW's implementation explicitly
    fourier_array = FFTW.rfft(inp_cpu)

    # Multiply in Fourier space
    fourier_array .*= coeff_cpu

    # Perform inverse FFT
    result = FFTW.irfft(fourier_array, size(inp_cpu, 2))

    # Copy back to output array (handles conversion back to original array type)
    out_array .= result

    return out_array
end

Dependancies: using AbstractFFTs, using FFTW, using Requires, using AMDGPU or using CUDA
Now, it works regardless of backend. But maybe folks have suggestions for a cleaner implementation.

Sorry, I might not understand.

But why is this not working?


using AbstractFFTs, FFTW, AMDGPU, CUDA
function poisson_solve!(out_array::AbstractArray{T,2}, 
                                       inp_array::AbstractArray{T,2},
                                       coefficient::AbstractArray{T,2}) where T

            fourier_array = rfft(inp_array)
            fourier_array .*= coefficient
            out_array .= irfft(fourier_array, size(inp_array, 2))
            return out_array
end

rfft is provided by AbstractFFTs.jl and CUDA, AMDGPU, FFTW already do the dispatch for the concrete types such as CuArray, …

1 Like

You are right, it does work for ROCArray, CuArray and Array. I had trouble because I had failed to import FFTW and therefore I was getting StackOverflow errors for CPU Array.

A separate implementation/dispatch is needed for MtlArray and oneArray because they do not have their own FFT backend.

Found a better way to do it such that it works on CPUs, AMDGPUs, CUDA GPUs, oneAPI and Metal (oneAPI and Metal fallback to CPU):
Dependancies depending on backend: using AMDGPU, using CUDA, using oneAPI, using Metal.
Here is the code:

using AbstractFFTs
using Requires
using FFTW
#using AMDGPU or CUDA or oneAPI or Metal

# Define traits for array types based on FFT support
abstract type FFTSupport end
struct NativeFFTSupport <: FFTSupport end
struct CPUFallbackFFT <: FFTSupport end

# Trait functions to determine FFT support
fft_trait(::Type{<:AbstractArray{T,N}}) where {T,N} = NativeFFTSupport()  # Default: native support
fft_trait(::Type) = CPUFallbackFFT()  # Unknown types: CPU fallback

# Specialize for types that need CPU fallback
function __init__()
    @require oneAPI="8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" begin
        fft_trait(::Type{<:oneAPI.oneArray}) = CPUFallbackFFT()
    end
    @require Metal="dde4c033-4e86-420c-a63e-0dd931031962" begin
        fft_trait(::Type{<:Metal.MtlArray}) = CPUFallbackFFT()
    end
end

function poisson_solve!(out_array::AbstractArray{T,2}, 
                        inp_array::AbstractArray{T,2},
                        coefficient::AbstractArray{T,2}, 
                        ::NativeFFTSupport = fft_trait(typeof(inp_array))) where T
    # Do Fourier Transform
    fourier_array = rfft(inp_array)

    # Multiply in Fourier space
    fourier_array .*= coefficient

    # Perform inverse FFT
    out_array .= irfft(fourier_array, size(inp_array, 2))

    return out_array
end

# Method for arrays requiring CPU fallback
function poisson_solve!(out_array::AbstractArray{T,2}, 
                        inp_array::AbstractArray{T,2},
                        coefficient::AbstractArray{T,2},
                        ::CPUFallbackFFT = fft_trait(typeof(inp_array))) where T
    # CPU fallback implementation
    array_type = typeof(inp_array).name.wrapper

    # Convert to CPU
    inp_cpu = Array(inp_array)
    coef_cpu = Array(coefficient)

    # Perform FFT on CPU
    fourier_array = rfft(inp_cpu)
    # Multiply coefficient in k-space
    fourier_array .*= coef_cpu
    # Do inverse fourier transform
    result_cpu = irfft(fourier_array, size(inp_cpu, 2))

    # Convert back to original type
    out_array .= array_type(result_cpu)

    return out_array
end
2 Likes