Hi all, I have moved over from Python, where I was using a bicubic interpolation written in NumPy. The code for that is below:
import numpy as np

kernel_x = np.random.rand(100, 4)
kernel_y = np.random.rand(100, 4)
ix0 = np.random.randint(0, 37, size=(100,))
iy0 = np.random.randint(0, 17, size=(100,))
kcoeff = np.random.rand(40, 20, 5011, 10)

def bicubic_numpy(n_layer, kernel_x, kernel_y, ix0, iy0, kcoeff):
    kcoeff_intp = np.zeros((n_layer, 5011, 10))
    for n in range(n_layer):
        for i in range(4):
            for j in range(4):
                # removed the -1 term after j
                kcoeff_intp[n, :, :] += kernel_x[n, i] * kernel_y[n, j] * kcoeff[ix0[n] + i, iy0[n] + j, :, :]
    return np.exp(np.log(10.0) * kcoeff_intp)  # elementwise 10.0 ** kcoeff_intp
Timing the above function with

%timeit bicubic_numpy(100, kernel_x, kernel_y, ix0, iy0, kcoeff)

gives

178 ms ± 258 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
When I moved the same problem to Julia, my naive implementation looked like this:
function my_bicubic_implementation!(
    cache::Array{Float64}, N_layer::Int64,
    kernel_x::Matrix{Float64}, kernel_y::Matrix{Float64},
    ix0::Vector{Int64}, iy0::Vector{Int64}, kcoeff::Array{Float64}
)
    for n in 1:N_layer
        for i in 1:4
            for j in 1:4
                for k in 1:size(kcoeff, 3)
                    for l in 1:size(kcoeff, 4)
                        cache[n, k, l] += kernel_x[n, i] * kernel_y[n, j] * kcoeff[ix0[n] + i, iy0[n] + j, k, l]
                    end
                end
            end
        end
    end
    @. cache = 10.0 ^ cache  # elementwise 10^x, matching NumPy's exp(log(10) * x)
    return cache
end
Benchmarking the above implementation with BenchmarkTools, on random arrays set up as
kcoeff_intp = zeros(Float64, 100, 5011, 10)
kernel_x = rand(100, 4)
kernel_y = rand(100, 4)
ix0 = rand(0:36, 100)  # 0-based offsets, so ix0[n] + i stays within 1:40 for i in 1:4
iy0 = rand(0:16, 100)  # likewise iy0[n] + j stays within 1:20 for j in 1:4
kcoeff = rand(40, 20, 5011, 10)
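The timing call itself was roughly the following (reconstructed, so the exact invocation may have differed; a fresh cache is created per evaluation because the function accumulates into it):

using BenchmarkTools

@btime my_bicubic_implementation!(cache, 100, $kernel_x, $kernel_y, $ix0, $iy0, $kcoeff) setup=(cache = zeros(100, 5011, 10)) evals=1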
This gives a time of about 3 s, which was obviously very slow. After applying some performance tips from others on Discord, I was able to get it to the same level as NumPy. Below is the final version of the code, where I used the @turbo macro (1.5x improvement), used an accumulator variable (3x improvement), and changed the index ordering of the outer loop to respect Julia's column-major layout (4x improvement).
using LoopVectorization  # provides @turbo

function bicubic(
    N_layer::Integer, kernel_x::AbstractMatrix{F}, kernel_y::AbstractMatrix{F},
    ix0::AbstractVector{<:Integer}, iy0::AbstractVector{<:Integer},
    kcoeff::AbstractArray{F, 4}
) where {F<:AbstractFloat}
    klim, llim = size(kcoeff, 3), size(kcoeff, 4)
    buf = zeros(F, N_layer, klim, llim)
    @inbounds for l in 1:llim, k in 1:klim, n in 1:N_layer
        acc = zero(F)  # <= accumulator variable
        @turbo for i in 1:4, j in 1:4
            acc += kernel_x[n, i] * kernel_y[n, j] * kcoeff[ix0[n] + i, iy0[n] + j, k, l]
        end
        buf[n, k, l] = exp10(acc)  # <= `exp10` instead of `10 ^`
    end
    return buf
end
This runs in about 160 ms, which is of the same order as NumPy. This is now satisfactory for me, but it would be great if someone could give tips on how to improve it further. That would help a lot. One thing I have been wondering about is multithreading the outer loops; a rough sketch of what I mean is below.
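This is only an untested sketch (bicubic_threaded is just a placeholder name, and Julia has to be started with several threads for it to do anything). Each thread gets its own slice of the last dimension, so the writes to buf should not collide:

function bicubic_threaded(
    N_layer::Integer, kernel_x::AbstractMatrix{F}, kernel_y::AbstractMatrix{F},
    ix0::AbstractVector{<:Integer}, iy0::AbstractVector{<:Integer},
    kcoeff::AbstractArray{F, 4}
) where {F<:AbstractFloat}
    klim, llim = size(kcoeff, 3), size(kcoeff, 4)
    buf = zeros(F, N_layer, klim, llim)
    Threads.@threads for l in 1:llim  # each thread handles a disjoint range of l
        @inbounds for k in 1:klim, n in 1:N_layer
            acc = zero(F)
            @turbo for i in 1:4, j in 1:4
                acc += kernel_x[n, i] * kernel_y[n, j] * kcoeff[ix0[n] + i, iy0[n] + j, k, l]
            end
            buf[n, k, l] = exp10(acc)
        end
    end
    return buf
end

Would something like this be expected to scale, or is the kernel too memory-bound for threads to help?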
Thanks!