Hi,
I am using CuArrays to perform FFT convolutions, and I am getting the error
`ERROR: CUFFTError(code 2, cuFFT failed to allocate GPU or CPU memory)`.
I have read threads about collecting temporaries, but I don’t know how to do this here.
I’d be happy to have some hints,
Thank you,
Romain
Here is a MWE (hopefully).
using CuArrays, GPUArrays, CUDAnative
import Base: *
# FFT-based convolution by a fixed kernel. The kernel spectrum and the FFT plans are
# computed once at construction time and reused by every `*` call.
module Convolution

export convolution

"""
    convolution(kernel::AbstractArray, gpu = false)

Precompute what is needed to convolve arrays with `kernel` through FFTs.

# Fields
- `kernel`: the kernel exactly as passed in.
- `kernel_fft`: `fft(fftshift(kernel))`, the spectrum multiplied in by `*`.
- `tmp`: `fft(kernel)`; stored, but not read by `*`.
- `fft_flag`: always `0`; not read by `*`.
- `N`: `size(kernel)[1]`.
- `n`: number of elements of `kernel`.
- `gpu`: the flag given to the constructor; stored only, `*` does not branch on it.
- `p_forward`, `p_backward`: forward and inverse FFT plans created once for arrays
  having the type and size of `kernel_fft`.
"""
struct convolution{K<:AbstractArray,F<:AbstractArray,T<:AbstractArray,PF,PB}
    kernel::K
    kernel_fft::F
    tmp::T
    fft_flag::Int
    N::Int
    n::Int
    gpu::Bool
    p_forward::PF
    p_backward::PB

    # The construction is the same for CPU and GPU arrays: every call below
    # dispatches on the array type of `kernel`.
    function convolution(kernel::AbstractArray, gpu = false)
        kernel_fft = fft(fftshift(kernel))
        tmp = fft(kernel)
        # Plan on the (complex) spectrum so the plans apply directly to arrays of
        # that same type and size; planning here means `*` never has to re-plan.
        p_forward = plan_fft(kernel_fft)
        p_backward = plan_ifft(kernel_fft)
        return new{typeof(kernel),typeof(kernel_fft),typeof(tmp),
                   typeof(p_forward),typeof(p_backward)}(
            kernel, kernel_fft, tmp, 0, size(kernel)[1], length(kernel), gpu,
            p_forward, p_backward)
    end
end

end

"""
    *(cv::Convolution.convolution, x::AbstractArray)

Return `real.(ifftshift(ifft(cv.kernel_fft .* fft(fftshift(x)))))`, i.e. the real
part of the FFT convolution of `x` with the kernel stored in `cv`.

When `x` (once made complex) has the same array type and size as `cv.kernel_fft`,
the plans stored in `cv` are reused. Calling `fft`/`ifft` directly plans the
transform again on every call.
NOTE(review): on the GPU each such plan presumably keeps cuFFT resources alive
until it is finalized, which looks like the cause of `CUFFTError(code 2)` in long
loops -- confirm against the CuArrays version in use.

Any other input falls back to the un-planned `fft`/`ifft` expression above.
"""
function *(cv::Convolution.convolution, x::AbstractArray)
    shifted = fftshift(x)
    # The stored plans were created for complex arrays, so promote real input
    # explicitly instead of relying on an implicit conversion inside `plan * x`.
    candidate = eltype(shifted) <: Real ? complex.(shifted) : shifted
    if typeof(candidate) === typeof(cv.kernel_fft) &&
       size(candidate) == size(cv.kernel_fft)
        spectrum = cv.kernel_fft .* (cv.p_forward * candidate)
        return real.(ifftshift(cv.p_backward * spectrum))
    end
    # Input does not match the plans (different size, precision or array type).
    return real.(ifftshift(ifft(cv.kernel_fft .* fft(shifted))))
end
# Driver: build a Gaussian kernel on an N x N grid, move it to the GPU, and apply
# the convolution repeatedly while printing the free device memory.
TY = Float32
dev = CUDAnative.CuDevice(0)
const gpu = cu

N = 2^10
L = 100
hx = TY(2L / N)
println("\n\n###############\n Neural Field solution , N = $N, dx= $hx\n" * "#"^20)

# Grid points -L, -L + hx, ..., written with dotted operators so no scalar + array
# method is needed.
X = TY.(-L .+ hx .* (0:N-1))
g = gpu(TY(1e-4) .* exp.(-(X.^2 .+ X'.^2) ./ 10))
J = Convolution.convolution(g, true)

v = gpu(rand(N, N))
# Zero-filled output buffer with the same type and size as `v`.
v2 = fill!(similar(v), zero(eltype(v)))

for ii in 1:20000
    v2 .= J * v
    # Print the iteration and the free device memory to watch for a leak.
    println((ii, GPUArrays.free_global_memory(dev)))
end