Hi, I am trying to optimize a simple function. I have written both devectorized and vectorized version codes for the function to see which one is faster. Here are the codes:
Vectorized:
function funvec(m::Int64,data::Array{Float64,2})
a,b = data[:,1], data[:,2]
M = Int(length(data[:,1]))
n = Int(m*(m+1)/2)
u = zeros(M,n)
v = zero(u)
k = 1
for i in 1:m
for j in 1:i #
u[:,k] .= @. a^(i-j)*b^(j-1)
if ((i-j)!=0)
v[:,k] .= @. (i-j)*a^(i-j-1)*b^(j-1)
end
k=k+1
end
end
return u,v
end
Devectorized:
function fundevec(m::Int64,data::Array{Float64,2})
a,b = data[:,1], data[:,2]
M = Int(length(data[:,1]))
n = Int(m*(m+1)/2)
u = zeros(M,n)
v = zero(u)
k = 1
for i in 1:m
for j in 1:i
Threads.@threads for s in 1:M
u[s,k] = a[s]^(i-j)*b[s]^(j-1)
if ((i-j)!=0)
v[s,k] = (i-j)*a[s]^(i-j-1)*b[s]^(j-1)
end
end
k=k+1
end
end
return u,v
end
Call them as
const m,N = 200,200
data = rand(N,2)
@benchmark funvec(m,data)
@benchmark fundevec(m,data)
I have two questions.
1-)
The above two functions are giving exactly same results and fundevec
is faster for now. To make funvec
faster I modify (by adding Threads.@threads before for loop) it as follows
function funvec(m::Int64,data::Array{Float64,2})
a,b = data[:,1], data[:,2]
M = Int(length(data[:,1]))
n = Int(m*(m+1)/2)
u = zeros(M,n)
v = zero(u)
k = 1
for i in 1:m
Threads.@threads for j in 1:i #
u[:,k] .= @. a^(i-j)*b^(j-1)
if ((i-j)!=0)
v[:,k] .= @. (i-j)*a^(i-j-1)*b^(j-1)
end
k=k+1
end
end
return u,v
end
But in this case I can not get same results with fundevec
. Can you explain why I get different results when I added Threads.@threads
?
2-) Can I optimize further, one of these two functions?
Thank you for your help.