Help speeding up FFT processing code

Hi all, I have a three functions that I have translated from Python and would like some assistance in improving their performance. The full working code is:

using FFTW

"""
Implementation of numpy.roll
"""
roll(vec, num) = [vec[mod1(i - num, length(vec))] for i in eachindex(vec)]

function roll(array::Matrix, num)
	rows, cols = size(array)

	flat = Array{eltype(Array), 1}(undef, rows*cols)

	for j = 1:rows
		flat[1 + (j-1)*cols:j*cols] = array[j, :]
	end

	rolled = roll(flat, num)

	for j = 1:rows
		array[j, :] = rolled[1 + (j-1)*cols:j*cols]
	end

	return array
end

function filter_map(siz::Int)
	basic_size  = ceil(Int, 0.25*siz)

	trap = vcat(
		collect(1:basic_size),
		fill(basic_size, basic_size-1),
		collect(basic_size:-1:1)
	)/basic_size

	out = zeros(Float64, (siz, 3))
	# Create RGB indices
	green_inds = round(Int, 0.5*(siz - length(trap))) .+ collect(1:length(trap))
	red_inds = mod1.(green_inds .+ basic_size, siz)
	blue_inds = mod1.(green_inds .- basic_size, siz)

	out[red_inds, 1] = trap
	out[green_inds, 2] = trap
	out[blue_inds, 3] = trap

	return out
end

 function csi_jl(array::Array{ComplexF64, 2})
	array =  permutedims(array)
	arows, acols = size(array)

	filter = filter_map(round(Int, acols))
	frows, fcols = size(filter)

	ph_indices = 1:frows .+ floor(Int32, 0.5*(acols - frows))
	ph0 = fftshift(ifft(array, 2), 2)[:, ph_indices]

	ph0_rgb = Array{eltype(ph0), 3}(undef, (3, arows, frows))
	for i = 1:3 
		ph0_rgb[i, :, :] = ph0.*filter[:, i]'
	end

	filter_shift = ceil(Int, 0.25*frows)
	ph0_rgb[1, :, :] = roll(ph0_rgb[1, :, :], -filter_shift)
	ph0_rgb[3, :, :] = roll(ph0_rgb[3, :, :], filter_shift)

	im0_rgb = abs.(fft(fftshift(ph0_rgb, 3), 3))

	# maximum creates a 1xNxM array, slice just gets NxM array
	scale_factor = abs.(array)./maximum(im0_rgb, dims=1)[1, :, :]
	for i = 1:3
		im0_rgb[i, :, :] = im0_rgb[i, :, :].*scale_factor
	end

	im0_rgb = permutedims(im0_rgb, [3, 2, 1])

	reverse!(im0_rgb, dims=3)

	return real(im0_rgb)
end

test_array = randn(ComplexF64, 500, 500)
csijl = csi_jl(test_array)

The timings I get show that my custom roll is 2.4x faster than numpy.roll, my filter_map is 1.8x faster, but csi_jl is 1.5x slower than the Python implementation. The benchmark is :

@benchmark csi_jl(test_array)

BenchmarkTools.Trial: 
  memory estimate:  139.58 MiB
  allocs estimate:  504145
  --------------
  minimum time:     93.170 ms (5.71% GC)
  median time:      100.893 ms (5.42% GC)
  mean time:        104.874 ms (5.53% GC)
  maximum time:     164.856 ms (6.27% GC)
  --------------
  samples:          48
  evals/sample:     1

The memory use and allocations seem high, but I haven’t been able to figure out a way to bring them, or the time, down. Any help would be appreciated.

For reference, the original code is here under filter_map_construction and csi_array.

ERROR: LoadError: UndefVarError: trapezoid not defined

Sorry, I made a slight name change, but it’s fixed now.

As general remarks:

  • slicing A[i, :] creates a copy (-> more memory allocations), consider using @view
  • you can do a few operations in-place, like
    fft!(fftshift(ph0_rgb, 3), 3)
    
    and
    im0_rgb[i, :, :] .*= scale_factor
    
2 Likes

Thanks, I was thinking about where best to use views, and I found a few spots. The scale_factor sugestion also helped a bit. What I have now is

function csi_jl(input_array::Array{ComplexF64, 2})
	array = @view input_array[:, :]
	array =  permutedims(array)
	arows, acols = size(array)

	filter = filter_map(round(Int, acols))
	frows, fcols = size(filter)

	ph_indices = 1:frows .+ floor(Int32, 0.5*(acols - frows))
	ph0 = fftshift(ifft(array, 2), 2)
	phview = @view ph0[:, ph_indices]

	ph0_rgb = Array{eltype(ph0), 3}(undef, (3, arows, frows))
	for i = 1:3
		fview = @view filter[:, i]
		ph0_rgb[i, :, :] = phview.*fview'
	end

	filter_shift = ceil(Int, 0.25*frows)
	ph0_rgb[1, :, :] = roll(ph0_rgb[1, :, :], -filter_shift)
	ph0_rgb[3, :, :] = roll(ph0_rgb[3, :, :], filter_shift)

	im0_rgb = abs.(fft!(fftshift(ph0_rgb, 3), 3))

	immax = @view maximum(im0_rgb, dims=1)[1, :, :]
	# maxmimum creates a 1xNxM array, slice just gets NxM array
	scale_factor = abs.(array)./immax
	for i = 1:3
		im0_rgb[i, :, :] .*= scale_factor
	end

	im0_rgb = permutedims(im0_rgb, [3, 2, 1])

	reverse!(im0_rgb, dims=3)

	return real(im0_rgb)
end

with a timing of

julia> csijl_time = @benchmark csi_jl(test_array)
BenchmarkTools.Trial: 
  memory estimate:  101.22 MiB
  allocs estimate:  503126
  --------------
  minimum time:     78.258 ms (0.00% GC)
  median time:      89.775 ms (4.49% GC)
  mean time:        93.422 ms (3.63% GC)
  maximum time:     142.141 ms (0.00% GC)
  --------------
  samples:          54
  evals/sample:     1

That’s ~15 ms off the time and ~40 MiB off the memory, pretty good I’d say, though still 12 ms (1.24x) behind the Python version. I’ll test and see if I can do any more.

You can do some more assignments in-place:

ph0_rgb[i, :, :] .= phview.*fview'
[...]
ph0_rgb[1, :, :] .= roll(ph0_rgb[1, :, :], -filter_shift)
ph0_rgb[3, :, :] .= roll(ph0_rgb[3, :, :], filter_shift)

Also, isn’t roll the same as circshift?

2 Likes

I tried a few variations of circshift and couldn’t get the same result. I think it’s difference between row-major and column-major.

I’ll try those assignments too and see what that gives me. Thanks.

It turns out I had an error in the benchmark for my roll function leading me to think it was faster than it was in practice. I reworked it using circshift and was able to shave off a bit more time. The code I have now is

using BenchmarkTools
using PyCall
using FFTW
using Random

"""
Roll matrix for when the shift is positive.
"""
function rollpos(mat, s)
	rows, cols = size(mat)  # Size of matrix
	rs = s ÷ cols  # Rows to shift
	cs = s % cols  # where to split columns
	mat1 = @view mat[:, 1:cols-cs]
	mat2 = @view mat[:, cols-cs+1:end]
	mat3 = circshift(mat2, (rs+1, 0))
	mat4 = circshift(mat1, (rs, 0))

	return hcat(mat3, mat4)
end
	
"""
Roll matrix when the shift equals the number of columns.
"""
function rolleq(mat, s)
	rows, cols = size(mat)  # Size of matrix
	rs = s ÷ cols  # Rows to shift
	return circshift(mat, (rs, 0))
end

"""
Roll matrix when the shift is negative
"""
function rollneg(mat, s)
	rows, cols = size(mat)  # Size of matrix
	rs = s ÷ cols
	cs = abs(s) % cols	
	mat1 = @view mat[:, 1+cs:end]
	mat2 = @view mat[:, 1:cs]
	mat3 = circshift(mat1, (rs, 0)) 
	mat4 = circshift(mat2, (rs-1, 0))
	
	return hcat(mat3, mat4)
end


"""
Roll a matrix along its rows. Same as `numpy.roll`.
"""
function roll(matrix, s::Int)
	rows, cols = size(matrix)  # size of matrix
	
	if s % cols == 0
		return rolleq(matrix, s)
	elseif s < 0
		return rollneg(matrix, s)
	else
		return rollpos(matrix, s)
	end
end


np = pyimport("numpy")

x1 = randn(1000, 1000)
n = 10
np.roll(x1, -1) == roll(x1, -1)

@benchmark np.roll($x1, $n;);  # 7.369 ms, 7.63 MiB, 43 allocs
@benchmark roll($x1, $n);  # 2.938 ms, 15.26 MiB, 6 allocs

function filter_map(siz::Int)

	basic_size  = ceil(Int, 0.25*siz)

	trap = vcat(
		1:basic_size,
		fill(basic_size, basic_size-1),
		basic_size:-1:1
	)/basic_size

	out = zeros(Float64, (siz, 3))

	# Create RGB indices
	green_inds = round(Int, 0.5*(siz - length(trap))) .+ collect(1:length(trap))
	red_inds = mod1.(green_inds .+ basic_size, siz)
	blue_inds = mod1.(green_inds .- basic_size, siz)
	out[red_inds, 1] .= trap
	out[green_inds, 2] .= trap
	out[blue_inds, 3] .= trap
	return out
end

function csi_jl(input_array::Array{ComplexF64, 2})
	array = @view input_array[:, :]
	array =  permutedims(array)  # Row-col swap (not the same as array')
	arows, acols = size(array)

	filter = filter_map(round(Int, acols))
	frows, fcols = size(filter)

	ph_indices = 1:frows .+ floor(Int32, 0.5*(acols - frows))
	ph0 = fftshift(ifft(array, 2), 2)
	phview = @view ph0[:, ph_indices]

	ph0_rgb = Array{eltype(ph0), 3}(undef, (3, arows, frows))
	for i = 1:3
		fview = @view filter[:, i]
		ph0_rgb[i, :, :] .= phview.*fview'
	end

	filter_shift = ceil(Int, 0.25*frows)
	# In-place roll ( .== roll) didn't change the result
	ph1 = @view ph0_rgb[1, :, :]
	ph3 = @view ph0_rgb[3, :, :]
	ph0_rgb[1, :, :] = roll(ph1, -filter_shift)
	ph0_rgb[3, :, :] = roll(ph3, filter_shift)

	im0_rgb = abs.(fft!(fftshift(ph0_rgb, 3), 3))

	immax = @view maximum(im0_rgb, dims=1)[1, :, :]
	# maxmimum creates a 1xNxM array, slice just gets NxM array
	scale_factor = abs.(array)./immax
	for i = 1:3
		im0_rgb[i, :, :] .*= scale_factor
	end

	im0_rgb = permutedims(im0_rgb, [3, 2, 1])

	reverse!(im0_rgb, dims=3)

	return real(im0_rgb)
end

test_array = randn(ComplexF64, 500, 500)
@benchmark csi_jl($test_array)
--------------------------------------------
julia> @benchmark csi_jl(test_array)
BenchmarkTools.Trial: 
  memory estimate:  70.61 MiB
  allocs estimate:  123
  --------------
  minimum time:     59.434 ms (1.56% GC)
  median time:      65.015 ms (1.46% GC)
  mean time:        67.045 ms (2.22% GC)
  maximum time:     78.513 ms (2.57% GC)
  --------------
  samples:          75
  evals/sample:     1

That’s 1/3 off the runtime, pretty good! Now, I’m ~1-2 ms faster than the Python version, so pretty much within error I’d say. It still seems like there are more gains to be had, but I can’t see any more low-hanging fruit.

1 Like

One thing you can do is to assign abs.(array) to a variable before ph0 = fftshift(ifft(array, 2), 2), so that you can do the FFT in-place also here:

absarray = abs.(array)
ph0 = fftshift(ifft!(array, 2), 2)
# ...
scale_factor = absarray ./ immax

but this isn’t a huge saving, as abs.(array) ./ immax now allocates a single array, with my suggestion you add a temporary array abs.(array). In the end, I don’t think it’s worth doing it.

1 Like

I made that switch, but as you said, it didn’t really affect the timing. Though, I left it in hoping I might be able to make use of it in the future. One big change is that I switched from using indivudal @view s to puttting @views in front of the whole function, which made a huge difference. Between that and eliminating one of the loops, I am down to ~46 ms, compared to Python’s ~61 ms. I might revisit this later, but I think that’s good for now. Thanks for your help! Here’s the “final” code:

@views function csi_jl(input_array::Array{ComplexF64, 2})
	array = input_array[:, :]
	array =  permutedims(array)  # Row-col swap (not the same as array')
	absarray = abs.(array)  # Need this later, allows for `ifft!`  rather than `ifft`
	arows, acols = size(array)

	filter = filter_map(round(Int, acols))
	frows, fcols = size(filter)

	ph_indices = 1:frows .+ floor(Int32, 0.5*(acols - frows))
	ph0 = fftshift(ifft!(array, 2), 2)
	phview = ph0[:, ph_indices]

	filter_shift = ceil(Int, 0.25*frows)
	# In-place roll ( .== roll) didn't change the result
	ph0_rgb = Array{eltype(ph0), 3}(undef, (3, arows, frows))
	ph0_rgb[1, :, :] .= roll(phview.*filter[:, 1]', -filter_shift)
	ph0_rgb[2, :, :] .=      phview.*filter[:, 2]'
	ph0_rgb[3, :, :] .= roll(phview.*filter[:, 3]', filter_shift)

	im0_rgb = abs.(fft!(fftshift(ph0_rgb, 3), 3))

	immax = maximum(im0_rgb, dims=1)[1, :, :]  # maxmimum creates a 1xNxM array, slice just gets NxM array
	scale_factor = absarray./immax

	for i = 1:3
		im0_rgb[i, :, :] .*= scale_factor
	end

	im0_rgb = permutedims(im0_rgb, [3, 2, 1])

	reverse!(im0_rgb, dims=3)

	return real(im0_rgb)
end
3 Likes