Preallocating the result of getindex(...)

question

#1

Let’s say I have an array such as
x = randn(1000),

and a pre-allocated destination like
xtmp = Array{Float64}(800)

I know that creating a slice like x[1:800] has to copy the data somewhere,
but how do I specify that I want that data copied into xtmp without making any additional copies?
I have been doing this via making a view, but I imagine there should be some way of doing this copying
without the overhead of creating a view.

Thanks!


#2

A loop works well.


#3
xtmp .= @view x[1:800]

if you want it vectorized, or

for i in 1:800
  xtmp[i] = x[i]
end

is a classic.


#4

I guess I should have provided more information on why I was asking this.

I have code similar to this:

x = randn(1_000_000);
inds = shuffle(1:1_000_000)[1:800000];
A = randn(50, 1_000_000);
xtmp = zeros(800_000);
Atmp = zeros(50, 800_000);

I’m finding that

julia> @time begin
           copy!(xtmp, view(x, inds))
           copy!(Atmp, view(A, :, inds))
           sum(Atmp * xtmp)
       end
  0.256046 seconds (41 allocations: 1.391 KiB)
12203.768937947363

is slower than

julia> @time sum(A[:,inds]*x[inds])
  0.195484 seconds (13 allocations: 311.280 MiB, 1.13% gc time)
12203.768937947363

despite the memory allocation taking place in the second version. I was essentially wondering if there was a way to write the second version using pre-allocated arrays to eliminate the extra allocations, since that version is clearly doing something different to the first one.


#5

Well this is a whole different thing.

inds = shuffle(1:1_000_000)[1:800000];

that’s not contiguous, so

 copy!(xtmp, view(x, inds))
 copy!(Atmp, view(A, :, inds))

those are views but do not contiguously address the memory. Then you do

Atmp * xtmp

that’s the operation which really matters here, but it’s really slow because the pointers are jumping wildly about in memory so it can’t SIMD or BLAS or anything to make it fast.

On the other hand,

A[:,inds]*x[inds]

that slices to allocate two new arrays, but when it puts the values in there the values are perfectly aligned in memory, so * in this case will be really fast and will be a BLAS call.

Moral of the story: blindly avoiding allocations will not make code faster.

Edit

Just to drive this point home, notice how much better it does when the view is more sane. With your code I get:

x = randn(1_000_000);
inds = shuffle(1:1_000_000)[1:800000];
A = randn(50, 1_000_000);
xtmp = zeros(800_000);
Atmp = zeros(50, 800_000);



# Preallocated benchmark variant: gather x[inds] and the selected columns of
# the matrix into scratch buffers, then reduce the matrix-vector product.
# NOTE(review): the original body read the non-const global `A` instead of a
# parameter, which defeats type inference and skews the benchmark. `Amat`
# defaults to that global so existing `f(xtmp, x, Atmp, inds)` calls still work.
function f(xtmp, x, Atmp, inds, Amat=A)
    copy!(xtmp, view(x, inds))        # gather x[inds] into xtmp without a temporary
    copy!(Atmp, view(Amat, :, inds))  # gather the selected columns into Atmp
    return sum(Atmp * xtmp)
end

# Allocating benchmark variant: slicing copies the selected data into fresh,
# contiguous arrays before the (BLAS-friendly) multiply.
# NOTE(review): `xtmp`/`Atmp` are accepted only to mirror `f`'s signature.
# The original read the non-const global `A`; `Amat` defaults to it for
# backward compatibility while allowing an explicit matrix argument.
function g(xtmp, x, Atmp, inds, Amat=A)
    return sum(Amat[:, inds] * x[inds])
end

@time f(xtmp,x,Atmp,inds)
@time f(xtmp,x,Atmp,inds)
@time g(xtmp,x,Atmp,inds)
@time g(xtmp,x,Atmp,inds)

  0.273511 seconds (36 allocations: 1.313 KiB)
  0.272545 seconds (36 allocations: 1.313 KiB)
  0.253679 seconds (13 allocations: 311.280 MiB, 9.58% gc time)
  0.266512 seconds (13 allocations: 311.280 MiB, 9.40% gc time)

but if I sort the indices, then the view is better behaved and it’s much faster:

x = randn(1_000_000);
inds = sort(shuffle(1:1_000_000)[1:800000]);
A = randn(50, 1_000_000);
xtmp = zeros(800_000);
Atmp = zeros(50, 800_000);



# Preallocated variant (sorted-indices run): fill the scratch buffers from
# views, then reduce the product.
# NOTE(review): the original captured the non-const global `A`; the trailing
# `Amat=A` parameter keeps old call sites working while letting callers pass
# the matrix explicitly for inferable, reproducible timings.
function f(xtmp, x, Atmp, inds, Amat=A)
    copy!(xtmp, view(x, inds))        # gather the selected entries of x
    copy!(Atmp, view(Amat, :, inds))  # gather the selected columns
    return sum(Atmp * xtmp)
end

# Allocating variant (sorted-indices run): slice-copy, then multiply and sum.
# NOTE(review): the original captured the non-const global `A`; the trailing
# `Amat=A` parameter is backward compatible and removes the hidden global.
function g(xtmp, x, Atmp, inds, Amat=A)
    return sum(Amat[:, inds] * x[inds])
end

@time f(xtmp,x,Atmp,inds)
@time f(xtmp,x,Atmp,inds)
@time g(xtmp,x,Atmp,inds)
@time g(xtmp,x,Atmp,inds)

  0.136312 seconds (36 allocations: 1.313 KiB)
  0.145749 seconds (36 allocations: 1.313 KiB)
  0.207015 seconds (13 allocations: 311.280 MiB, 11.87% gc time)
  0.202954 seconds (13 allocations: 311.280 MiB, 12.78% gc time)

#6

But I’m fairly sure the result of copy!(xtmp, view(x, inds)) is contiguous, since it is a plain Array{Float64, 1} rather than a SubArray.

julia> copy!(xtmp, view(x, inds))
800000-element Array{Float64,1}:

julia> view(x, inds)
800000-element SubArray{Float64,1,Array{Float64,1},Tuple{Array{Int64,1}},false}:

I know that the views themselves involve wild pointer-jumping which prevents SIMD and BLAS optimizations, so that’s why the below code is even slower.

julia> @time sum(view(A, :, inds) * view(x, inds))
  0.602032 seconds (41 allocations: 1.391 KiB)
12203.768937947227

It is interesting to see the amount of time needed change so drastically when the indices are sorted, though. Is there some reason that the allocating version doesn’t speed up as much as the view version when the indices are sorted? I would imagine that getindex(…) would also suffer from pointer-jumping problems in the first case and benefit from sorted indices in the second.

Thanks for the in-depth analysis by the way!


#7

oh yes, that is right. Then somehow the slicing is able to do this much more efficiently than the view. Why, I don’t know. It would also explain the timing results above,

but someone who knows the views/slicing code well would have to comment.


#8

I can’t reproduce the original timing results. For me, the copy! version is slightly faster. I’m also getting fewer allocations than you in both cases. Are you on 0.6?

BTW, replacing copy!(Atmp, view(A, :, inds)) with Atmp .= view(A, :, inds) is slightly faster for me; it appears that the generic AbstractArray version of copy! isn’t fully inlined for matrices (it is for vectors).

Edit: it’s not just the inlining; it appears the code for the broadcast version is also just faster than the code for copy!.


#9

on my computer an explicit loop is generally (not always!) slightly faster (see f3 below)
(it may not be worth the trouble though)

I think it depends on how exactly the indices look like. If you run the code a few times, you can see to what extent the timings vary

I think a number of the macros in my code below may be unnecessary (sorry, I am not quite sure where to place them or whether those are needed at all anymore).

# Preallocated variant: copy the gathered data into the scratch buffers via
# views, then reduce the matrix-vector product.
function f1(xtmp, Atmp, inds, x, A)
    xv = view(x, inds)     # non-contiguous view of the selected entries
    Av = view(A, :, inds)  # view of the selected columns
    copy!(xtmp, xv)
    copy!(Atmp, Av)
    return sum(Atmp * xtmp)
end

# Allocating variant: slicing builds fresh contiguous copies, which the dense
# multiply can then process at full speed.
function f2(xtmp, Atmp, inds, x, A)
    Asub = A[:, inds]  # copies the selected columns into a dense matrix
    xsub = x[inds]     # copies the selected entries into a dense vector
    return sum(Asub * xsub)
end

# Explicit-loop variant: hand-written gather into the preallocated buffers,
# then reduce the product.
# BUG FIX(review): the original inner loop wrote `A[l, j] = A[l, ij]`,
# overwriting the input matrix `A` and never filling `Atmp`, so the result
# used stale buffer contents. It now writes `Atmp[l, j]`.
# The innermost loop runs over rows (first index), matching Julia's
# column-major layout; `@simd` belongs on that innermost, independent loop,
# not the outer one as in the original.
function f3(xtmp, Atmp, inds, x, A)
    sz = size(A, 1)
    # assumes length(xtmp) >= length(inds) and size(Atmp, 2) >= length(inds)
    @inbounds for j in eachindex(inds)
        ij = inds[j]
        xtmp[j] = x[ij]
        @simd for l in 1:sz
            Atmp[l, j] = A[l, ij]
        end
    end
    return sum(Atmp * xtmp)
end

# Benchmark driver: build the test data once, warm up each variant (the first
# call compiles), then time `n` repetitions of each and check they agree.
# NOTE(review): `shuffle` lives in the Random stdlib on Julia >= 1.0; this
# 0.6-era script assumes it is already in scope.
function something(n)
    x = randn(1_000_000)
    inds = shuffle(1:1_000_000)[1:800_000]
    A = randn(50, 1_000_000)
    xtmp = zeros(800_000)
    Atmp = zeros(50, 800_000)

    # Warm-up calls so @time below measures run time, not compilation.
    f1(xtmp, Atmp, inds, x, A)
    f2(xtmp, Atmp, inds, x, A)
    f3(xtmp, Atmp, inds, x, A)

    x3 = x2 = x1 = 0.0
    @time for u in 1:n
        x1 = f1(xtmp, Atmp, inds, x, A)
    end
    @time for u in 1:n
        x2 = f2(xtmp, Atmp, inds, x, A)
    end
    @time for u in 1:n
        x3 = f3(xtmp, Atmp, inds, x, A)
    end
    # BUG FIX(review): the original printed `x1, x3, x3`, showing x3 twice
    # and never x2.
    @show x1, x2, x3
    @show x1 == x2 == x3
end

something(20)