Unreasonably fast FFT on CUDA

3D convolutions are usually much faster when computed via FFTs, unless the kernel is very small (e.g. 2×2×2).

Also, here are my timings for an in-place 3D FFT of a ComplexF32 array, GPU vs. CPU.

julia> using FFTW, CUDA, BenchmarkTools

julia> function try_FFT_on_cuda()
           # Benchmark one in-place 3D FFT of a 353x353x353 ComplexF32 array on the GPU.
           # Prints the @btime result and returns nothing.
           host = ComplexF32.(rand(Float32, 353, 353, 353))
           # Upload once; `cvalues` is only the array the plan is created for.
           cvalues = cu(host)
           # Buffer that is actually transformed; copied so it holds defined data
           # instead of the uninitialised contents `similar` would give.
           cy = copy(cvalues)
           cF = plan_fft!(cvalues)
           # The plan is in-place, so every sample transforms the previous output;
           # only the timing is meaningful. CUDA.@sync waits for the GPU to finish.
           @btime CUDA.@sync a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cuda (generic function with 1 method)

julia> try_FFT_on_cuda()
  20.319 ms (6 allocations: 304 bytes)

julia> function try_FFT_on_cpu()
           # Benchmark one in-place 3D FFT of a 353x353x353 ComplexF32 array on the CPU.
           # Prints the @btime result and returns nothing.
           cvalues = ComplexF32.(rand(Float32, 353, 353, 353))
           # Buffer that is actually transformed. Copy it *before* planning: per the
           # FFTW docs, planning with flags other than ESTIMATE may overwrite the
           # array handed to the planner.
           cy = copy(cvalues)
           cF = plan_fft!(cvalues, flags=FFTW.MEASURE)
           # The plan is in-place, so every sample transforms the previous output;
           # only the timing is meaningful.
           @btime a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cpu (generic function with 1 method)

julia> try_FFT_on_cpu()
  100.503 ms (314 allocations: 27.09 KiB)



# Same benchmarks with power-of-two dimensions (256×256×256)
julia> function try_FFT_on_cpu()
           # Benchmark one in-place 3D FFT of a 256x256x256 ComplexF32 array on the CPU.
           # Prints the @btime result and returns nothing.
           cvalues = ComplexF32.(rand(Float32, 256, 256, 256))
           # Buffer that is actually transformed. Copy it *before* planning: per the
           # FFTW docs, planning with flags other than ESTIMATE may overwrite the
           # array handed to the planner.
           cy = copy(cvalues)
           cF = plan_fft!(cvalues, flags=FFTW.MEASURE)
           # The plan is in-place, so every sample transforms the previous output;
           # only the timing is meaningful.
           @btime a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cpu (generic function with 1 method)

julia> function try_FFT_on_cuda()
           # Benchmark one in-place 3D FFT of a 256x256x256 ComplexF32 array on the GPU.
           # Prints the @btime result and returns nothing.
           host = ComplexF32.(rand(Float32, 256, 256, 256))
           # Upload once; `cvalues` is only the array the plan is created for.
           cvalues = cu(host)
           # Buffer that is actually transformed; copied so it holds defined data
           # instead of the uninitialised contents `similar` would give.
           cy = copy(cvalues)
           cF = plan_fft!(cvalues)
           # The plan is in-place, so every sample transforms the previous output;
           # only the timing is meaningful. CUDA.@sync waits for the GPU to finish.
           @btime CUDA.@sync a = ($cF*$cy)
           return nothing
       end
try_FFT_on_cuda (generic function with 1 method)

julia> try_FFT_on_cpu()
  18.883 ms (314 allocations: 27.09 KiB)

julia> try_FFT_on_cuda()
  2.483 ms (6 allocations: 304 bytes)

1 Like