Hi all, I have a three functions that I have translated from Python and would like some assistance in improving their performance. The full working code is:
using FFTW
"""
Implementation of numpy.roll
"""
roll(vec, num) = [vec[mod1(i - num, length(vec))] for i in eachindex(vec)]
function roll(array::Matrix, num)
rows, cols = size(array)
flat = Array{eltype(Array), 1}(undef, rows*cols)
for j = 1:rows
flat[1 + (j-1)*cols:j*cols] = array[j, :]
end
rolled = roll(flat, num)
for j = 1:rows
array[j, :] = rolled[1 + (j-1)*cols:j*cols]
end
return array
end
function filter_map(siz::Int)
basic_size = ceil(Int, 0.25*siz)
trap = vcat(
collect(1:basic_size),
fill(basic_size, basic_size-1),
collect(basic_size:-1:1)
)/basic_size
out = zeros(Float64, (siz, 3))
# Create RGB indices
green_inds = round(Int, 0.5*(siz - length(trap))) .+ collect(1:length(trap))
red_inds = mod1.(green_inds .+ basic_size, siz)
blue_inds = mod1.(green_inds .- basic_size, siz)
out[red_inds, 1] = trap
out[green_inds, 2] = trap
out[blue_inds, 3] = trap
return out
end
function csi_jl(array::Array{ComplexF64, 2})
array = permutedims(array)
arows, acols = size(array)
filter = filter_map(round(Int, acols))
frows, fcols = size(filter)
ph_indices = 1:frows .+ floor(Int32, 0.5*(acols - frows))
ph0 = fftshift(ifft(array, 2), 2)[:, ph_indices]
ph0_rgb = Array{eltype(ph0), 3}(undef, (3, arows, frows))
for i = 1:3
ph0_rgb[i, :, :] = ph0.*filter[:, i]'
end
filter_shift = ceil(Int, 0.25*frows)
ph0_rgb[1, :, :] = roll(ph0_rgb[1, :, :], -filter_shift)
ph0_rgb[3, :, :] = roll(ph0_rgb[3, :, :], filter_shift)
im0_rgb = abs.(fft(fftshift(ph0_rgb, 3), 3))
# maximum creates a 1xNxM array, slice just gets NxM array
scale_factor = abs.(array)./maximum(im0_rgb, dims=1)[1, :, :]
for i = 1:3
im0_rgb[i, :, :] = im0_rgb[i, :, :].*scale_factor
end
im0_rgb = permutedims(im0_rgb, [3, 2, 1])
reverse!(im0_rgb, dims=3)
return real(im0_rgb)
end
test_array = randn(ComplexF64, 500, 500)
csijl = csi_jl(test_array)
The timings I get show that my custom roll
is 2.4x faster than numpy.roll
, my filter_map
is 1.8x faster, but csi_jl
is 1.5x slower than the Python implementation. The benchmark is :
@benchmark csi_jl(test_array)
BenchmarkTools.Trial:
memory estimate: 139.58 MiB
allocs estimate: 504145
--------------
minimum time: 93.170 ms (5.71% GC)
median time: 100.893 ms (5.42% GC)
mean time: 104.874 ms (5.53% GC)
maximum time: 164.856 ms (6.27% GC)
--------------
samples: 48
evals/sample: 1
The memory use and allocations seem high, but I haven’t been able to figure out a way to bring them, or the time, down. Any help would be appreciated.
For reference, the original code is here under filter_map_construction
and csi_array
.