3D convolutions are usually much faster with FFTs, unless the kernel is very small, e.g. (2, 2, 2).
For reference, here are my benchmark numbers.
julia> using FFTW, CUDA, BenchmarkTools
julia> function try_FFT_on_cuda(; dims=(353, 353, 353))
           # Benchmark an in-place complex single-precision FFT of size `dims` on the GPU.
           # Prints the @btime result and returns nothing.
           #
           # Build real-valued ComplexF32 input on the host and upload it exactly once,
           # so the buffer that gets transformed holds initialized data.
           cy = CuArray(ComplexF32.(rand(Float32, dims)))
           cF = plan_fft!(cy)
           # The transform is in place and repeated by @btime, so the contents of `cy`
           # are not meaningful afterwards; only the timing is of interest.
           # CUDA.@sync makes the measurement include the GPU work itself.
           @btime CUDA.@sync($cF * $cy)
           return nothing
       end
try_FFT_on_cuda (generic function with 1 method)
julia> try_FFT_on_cuda()
20.319 ms (6 allocations: 304 bytes)
julia> function try_FFT_on_cpu(; dims=(353, 353, 353))
           # Benchmark an in-place complex single-precision FFT of size `dims` on the CPU.
           # Prints the @btime result and returns nothing.
           cy = Array{ComplexF32}(undef, dims)
           # Plan before filling: planning with FFTW.MEASURE may overwrite the array
           # it is given, so the data is written only afterwards.
           cF = plan_fft!(cy, flags=FFTW.MEASURE)
           # Real-valued random input, converted to ComplexF32 by the broadcast assignment.
           cy .= rand(Float32, dims)
           # The transform is in place and repeated by @btime, so the contents of `cy`
           # are not meaningful afterwards; only the timing is of interest.
           @btime $cF * $cy
           return nothing
       end
try_FFT_on_cpu (generic function with 1 method)
julia> try_FFT_on_cpu()
100.503 ms (314 allocations: 27.09 KiB)
# Same benchmarks with a power-of-2 size (256 x 256 x 256)
julia> function try_FFT_on_cpu(; dims=(256, 256, 256))
           # Benchmark an in-place complex single-precision FFT of size `dims` on the CPU.
           # Prints the @btime result and returns nothing.
           cy = Array{ComplexF32}(undef, dims)
           # Plan before filling: planning with FFTW.MEASURE may overwrite the array
           # it is given, so the data is written only afterwards.
           cF = plan_fft!(cy, flags=FFTW.MEASURE)
           # Real-valued random input, converted to ComplexF32 by the broadcast assignment.
           cy .= rand(Float32, dims)
           # The transform is in place and repeated by @btime, so the contents of `cy`
           # are not meaningful afterwards; only the timing is of interest.
           @btime $cF * $cy
           return nothing
       end
try_FFT_on_cpu (generic function with 1 method)
julia> function try_FFT_on_cuda(; dims=(256, 256, 256))
           # Benchmark an in-place complex single-precision FFT of size `dims` on the GPU.
           # Prints the @btime result and returns nothing.
           #
           # Build real-valued ComplexF32 input on the host and upload it exactly once,
           # so the buffer that gets transformed holds initialized data.
           cy = CuArray(ComplexF32.(rand(Float32, dims)))
           cF = plan_fft!(cy)
           # The transform is in place and repeated by @btime, so the contents of `cy`
           # are not meaningful afterwards; only the timing is of interest.
           # CUDA.@sync makes the measurement include the GPU work itself.
           @btime CUDA.@sync($cF * $cy)
           return nothing
       end
try_FFT_on_cuda (generic function with 1 method)
julia> try_FFT_on_cpu()
18.883 ms (314 allocations: 27.09 KiB)
julia> try_FFT_on_cuda()
2.483 ms (6 allocations: 304 bytes)