I am trying to optimize my code and I’ve tried to follow all performance tips in the manual and browsing the forum. I’ve avoided slicing arrays with @views
, made sure to pre-allocate arrays, etc. However, I don’t really know what the “lower bound” of memory allocation my function should get to. My function basically gets a large array 3D array and loops over the 3rd dimension to perform a calculation (correlation matrix) and store it in a result array. I need to perform this many many times (inside an optimization). Here is the gist and the many attempts to maximize performance:
function testfun(a,b) #my naive implementation
temp = [[a[:,:,t] b[:,:,t]] for t = 1:10]
res = Vector{Float64}(undef, Int(10*6*5/2))
for t = 1:10
range2 = ((t-1)*15 + 1):15*t
res[range2] .= @views [cor(temp[t])[i,j] for j = 1:5 for i = (j+1):6][:]
end
res[isnan.(res)] .= -10.0
return res
end
function testfun2(a,b) #pre-allocate intermediate and output arrays
temp = Matrix{Float64}(undef, size(a,1), size(a,2)+size(b,2))
res = Vector{Float64}(undef, Int(10*6*5/2))
for t = 1:10
temp .= @views [a[:,:,t] b[:,:,t]]
range2 = ((t-1)*15 + 1):15*t
res[range2] .= @views [cor(temp)[i,j] for j = 1:5 for i = (j+1):6][:]
end
res[isnan.(res)] .= -10.0
return res
end
function testfun3!(res,a,b,temp) # use arrays as inputs
for t = 1:10
temp .= @views [a[:,:,t] b[:,:,t]]
range2 = ((t-1)*15 + 1):15*t
res[range2] .= @views [cor(temp)[i,j] for j = 1:5 for i = (j+1):6][:]
end
res[isnan.(res)] .= -10.0
return res
end
function testfun4!(res,c,temp) # instead of concatenating, use as input
for t = 1:10
temp .= @views c[t]
range2 = ((t-1)*15 + 1):15*t
res[range2] .= @views [cor(temp)[i,j] for j = 1:5 for i = (j+1):6][:]
end
res[isnan.(res)] .= -10.0
return res
end
a = rand(300_000, 2, 10)
b = rand(300_000, 4, 10)
temp = Matrix{Float64}(undef, size(a,1), size(a,2)+size(b,2))
res = Vector{Float64}(undef, Int(10*6*5/2))
c = [[a[:,:,t] b[:,:,t]] for t = 1:10]
@btime testfun($a,$b) #2.456s (1755 allocations: 2.28 GiB)
@btime testfun2($a,$b) #1.690s (1716 allocations: 2.16 GiB)
@btime testfun3!($res,$a,$b,$temp) #1.511s (1713 allocations: 2.15 GiB)
@btime testfun4!($res,$c,$temp) #1.597s (1693 allocations: 2.01GiB)
Is there something else I am missing?