Vector notation, intermediate value and allocations

uliano · June 16, 2022, 7:24am

I made great progress in understanding allocation mechanism, I’m still failing occasionally to write unallocating vectorized code, not even sure that in the specific cases belowmentioned would it even be possible

Under the allocating vectorized code there is, commented, working code. My concerns are ONLY for lthe last 3 large allocations which are inside the innerloop.

I like vector notation as it makes the intent much more clear.


        - function write_xtc_atoms(file::IOStream, precision::Real, coords::AbstractMatrix{T}) where {T <: Real}
        0     pos = position(file)
        0     natoms = size(coords)[2]
        -     prev_atom = 0
      176     write(file, hton(Int32(natoms)))
        0     if natoms <= 9
        0         for j in 1:3
        0             for atom in 1:natoms
        0                 write(file, hton(Float32(coords[i, atom])))
        -             end
        -         end
        -     else
      352         minint = MVector{3, Int32}(undef)
        0         maxint = MVector{3, Int32}(undef)
      352         prevcoord = @MVector Int[0, 0, 0]
      352         tmp = MVector{3, Int}(undef)
     2816         tmpcoords = MMatrix{3, 10, Int}(undef)
  7022928         intcoords = Matrix{Int}(undef, 3, natoms)
  4390320         buffer = BitBuffer(natoms * 15)
      176         write(file, hton(Float32(precision)))
        0         minint .= typemax(Int32)
        0         maxint .= typemin(Int32)
        -         mindiff = typemax(Int32)
        -         prevrun = -1
        0         for atom in 1:natoms
        0             fc = @SVector Float32[round(coords[i, atom] * precision) for i in 1:3]
        0             if any(fc .> max_absolute_int )
        0                 seek(file, pos)
        0                 return false
        -                 # scaling would cause overflow 
        -             end
        0             for ii in 1:3
        0                 intcoords[ii, atom] = convert(Int, fc[ii])
        0                 minint[ii] = intcoords[ii, atom] < minint[ii] ? intcoords[ii, atom] : minint[ii]
        0                 maxint[ii] = intcoords[ii, atom] > maxint[ii] ? intcoords[ii, atom] : maxint[ii]
        0                 tmp[ii] = prevcoord[ii] - intcoords[ii, atom]
        -             end
        0             diff = norm(tmp, 1)
        0             if diff < mindiff && atom > 1
        -                 mindiff = diff
        -             end
        0             prevcoord .= view(intcoords, :, atom)
        -         end
        0         for ii in 1:3
      528             write(file, hton(minint[ii]))
        -         end
        0         for ii in 1:3
      528             write(file, hton(maxint[ii]))
        -         end
        0         if any(convert.(Float32, maxint - minint) .>= max_absolute_int)
        0             seek(file, pos)
        0             return false
        -             # turning values to unsigned would cause overflow
        -         end
        0         sizeint = SA[maxint[1] - minint[1] + 1,
        -                      maxint[2] - minint[2] + 1,
        -                      maxint[3] - minint[3] + 1]
        0         if any( sizeint .> 2^24-1)
        0             bitsizeint = sizeofint.(sizeint)
        -             bitsize = 0
        -         else    
        0             bitsize = sizeofints(sizeint)
        -         end
        0         smallidx = FIRSTINDEX
        0         while smallidx < LASTIDX && magicints[smallidx + 1] < mindiff
        0             smallidx += 1
        -         end
      176         write(file, hton(Int32(smallidx)))
        0         maxidx = min(LASTIDX, smallidx + 8)
        0         minidx = maxidx - 8
        0         smaller = magicints[max(FIRSTINDEX, smallidx - 1) + 1] ÷ 2
        0         smallnum = magicints[smallidx + 1] ÷ 2
        0         sizesmall = SA[magicints[smallidx + 1], magicints[smallidx + 1], magicints[smallidx + 1]]
        0         larger = magicints[maxidx + 1] ÷ 2
        -         atom = 1
        0         while atom <= natoms
        -             is_small = 0
        - 
  9274176             if smallidx < maxidx && atom > 1 && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< larger)
        -             # if smallidx < maxidx && atom > 1 && 
        -             #    abs(intcoords[1, atom] - intcoords[1, prev_atom]) < larger &&
        -             #    abs(intcoords[2, atom] - intcoords[2, prev_atom]) < larger &&
        -             #    abs(intcoords[3, atom] - intcoords[3, prev_atom]) < larger
        -                 is_smaller = 1
        0             elseif smallidx > minidx
        -                 is_smaller = -1
        -             else
        -                 is_smaller = 0
        -             end
        -             # can we swap? should we swap?
        - 
  9469920             if atom + 1 <= natoms && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, atom + 1)) .< smallnum)
        -             # tmp .= abs.(view(intcoords, :, atom) .- view(intcoords, :, atom + 1))
        -             # if atom + 1 <= natoms && tmp[1] < smallnum && tmp[2] < smallnum && tmp[3] < smallnum
        - 
        -                 # this was kinda shot in the dark, breaks functionality.
        -                 # intcoords[:, atom], intcoords[:, atom + 1] = view(intcoords, :, atom + 1), view(intcoords, :, atom)
        0                 tmp .= view(intcoords, :, atom)
        0                 intcoords[:, atom] .= view(intcoords, :, atom + 1)
        0                 intcoords[:, atom + 1] .= tmp
        -                 is_small = 1
        -             end
        - 
        0             tmp .= view(intcoords, :, atom) .- minint
        - 
        0             if bitsize == 0
        0                 for ii in 1:3
        0                     sendbits!(buffer, tmp[ii], bitsizeint[1])
        -                 end
        -             else
        0                 sendints!(buffer, bitsize, sizeint, tmp)
        -             end
        -             prev_atom = atom
        0             atom += 1
        -             run = 1
        0             if is_small == 0 && is_smaller == -1
        -                 is_smaller = 0
        -             end
        0             while is_small != 0 && run <= 8
        - 
        0                 if is_smaller == -1 && norm(view(intcoords, :, atom) - view(intcoords, :, prev_atom)) >= smaller
        -                 # tmp .= view(intcoords, :, atom) .- view(intcoords, :, prev_atom)
        -                 # if is_smaller == -1 && norm(tmp) >= smaller
        -                     is_smaller = 0
        -                 end
        0                 tmpcoords[:, run] .= view(intcoords, :, atom) .- view(intcoords, :, prev_atom) .+ smallnum
        0                 run += 1
        -                 prev_atom = atom
        0                 atom += 1
        -                 is_small = 0
        - 
 18618624                 if atom <= natoms && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< smallnum)
        -             
        -                 # this doesn't work (breaks functionality, why???)
        -                 # tmp .= abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< smallnum
        -                 # if atom <= natoms && tmp[1] < smallnum && tmp[2] < smallnum && tmp[3] < smallnum
        - 
        -                 # if atom <= natoms &&
        -                 #    abs(intcoords[1, atom] - intcoords[1, prev_atom]) < smallnum &&
        -                 #    abs(intcoords[2, atom] - intcoords[2, prev_atom]) < smallnum &&
        -                 #    abs(intcoords[3, atom] - intcoords[3, prev_atom]) < smallnum
        - 
        0                     is_small = 1
        -                 end
        -             end
        0             if prevrun != run || is_smaller != 0
        -                 prevrun = run;
        0                 sendbits!(buffer, 1, 1)
        0                 sendbits!(buffer, (run - 1) * 3 + is_smaller + 1, 5)
        -             else
        0                 sendbits!(buffer, 0, 1)
        -             end
        0             for k in 1:run - 1
        0                 sendints!(buffer, smallidx, sizesmall, view(tmpcoords, :, k))
        -             end
        0             if is_smaller != 0
        0                 smallidx += is_smaller
        0                 if is_smaller < 0
        -                     smallnum = smaller
        0                     smaller = magicints[smallidx] ÷ 2
        -                 else
        -                     smaller = smallnum
        0                     smallnum = magicints[smallidx + 1] ÷ 2
        -                 end
        0                 sizesmall = SA[magicints[smallidx + 1], magicints[smallidx + 1], magicints[smallidx + 1]]
        -             end
        -         end
      176         write(file, hton(Int32(length(buffer))))
        0         write(file, buffer)
        -         # xdr format requires 32 bit alignment
        0         while mod(position(file), 4) != 0
        0             write(file, zero(UInt8))
        -         end
        -     end 
        0     return true
        - end

Philippe_Maincon1 · June 16, 2022, 8:52am

Hi Uliano,

The way you ask, it’s not easy to help.

Can you reproduce your question with a shorter function (a minimum working example, MWE)?
Can you provide the call that runs this function, and provides an unwanted output (be it an error, or a measure of low performance). This includes any module/package you might be using.
How large are your arrays (1e1, 1e2, … 1e6)? (we might end up discussing heap vs. stack - StaticArrays.jl

uliano · June 16, 2022, 9:18am

the function was there to provide the context. Most relevant parts are:

        tmp = MVector{3, Int}(undef) # from StaticArrays
        tmpcoords = MMatrix{3, 10, Int}(undef) # from StaticArrays
        intcoords = Matrix{Int}(undef, 3, natoms)

natoms sometimes can reach 10^6, more likely 20-100 x 10^3

everithing else is a scalar

first “hotspot”:

  9274176             if smallidx < maxidx && atom > 1 && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< larger)
        -             # if smallidx < maxidx && atom > 1 && 
        -             #    abs(intcoords[1, atom] - intcoords[1, prev_atom]) < larger &&
        -             #    abs(intcoords[2, atom] - intcoords[2, prev_atom]) < larger &&
        -             #    abs(intcoords[3, atom] - intcoords[3, prev_atom]) < larger

both commented and uncommented perform the expected task but only the uncommented allocates

second “hotspot”:

  9469920             if atom + 1 <= natoms && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, atom + 1)) .< smallnum)
        -             # tmp .= abs.(view(intcoords, :, atom) .- view(intcoords, :, atom + 1))
        -             # if atom + 1 <= natoms && tmp[1] < smallnum && tmp[2] < smallnum && tmp[3] < smallnum

again, both commented and uncommented perform the expected task but only the uncommented allocates

third “hotspot”:

 18618624                 if atom <= natoms && all(abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< smallnum)
        -             
        -                 # this doesn't work (breaks functionality, why???)
        -                 # tmp .= abs.(view(intcoords, :, atom) .- view(intcoords, :, prev_atom)) .< smallnum
        -                 # if atom <= natoms && tmp[1] < smallnum && tmp[2] < smallnum && tmp[3] < smallnum
        - 
        -                 # if atom <= natoms &&
        -                 #    abs(intcoords[1, atom] - intcoords[1, prev_atom]) < smallnum &&
        -                 #    abs(intcoords[2, atom] - intcoords[2, prev_atom]) < smallnum &&
        -                 #    abs(intcoords[3, atom] - intcoords[3, prev_atom]) < smallnum

here we ave a further commented version which doesnt’ produce the expected result and I don’t understand why

GunnarFarneback · June 16, 2022, 9:45am

All those allocations have the same source and happen because all is a function and all(y) needs y to be computed before all is called, and with y = f.(x) the output of the vectorized call f.(x) needs temporary storage before all is called.

However, there is a second method of all, which allows you to rewrite all(f.(x)) as all(f, x), where f is called on demand by all and thus can avoid allocations. It’s not super convenient for the kind of f you have in your code but you can try if it gives you a satisfactory result.

uliano · June 16, 2022, 10:08am

It is really unfortunate that the compiler couldn’t figure out that there are only 3 temporary values (as per MVector{3,) and store these intermediates in registers or on the stack.

I’ll give a try with the the alternate all. Is there something equivalent also for norm?

ffevotte · June 16, 2022, 10:17am

But your test involves intcoords, which is a Matrix (not an MVector). How could the compiler figure out that this matrix will always be of size 3xN ?

I also kind of like the form where all is called on a generator expression. For your first hotspot, it would looke like:

all(@inbounds abs(intcoords[i, atom] - intcoords[i, prev_atom]) < larger for i in 1:3)

uliano · June 16, 2022, 1:19pm

You’re right, probably it could have been inferred observing the various assignments and broadcast to and from 3-vectors but that’s non local to the specific piece of code.

I could have used an MMatrix for intcoords but if I understood right it is not reccomanded to exceed a few dozen elements.

Also, MArrays are a patch for a “missing feature” that would have been more nicely implemented in the language, i.e. fixed size mutable arrays (possibly stack based). that would probably solve most of this temporary allocation troubles.

ffevotte · June 16, 2022, 1:44pm

Then why not something like mutable arrays of fixed-size, immutable structs (such as tuples)?

In your example, I believe intcoords is actually best described as a vector of atom coordinates. And a coordinate in a 3-dimensional space is a 3-tuple. So why not match this description in the data structures you’re using? A much simplified version of your code could look like this:

julia> natoms = 10
10

# use NTuple{3, Int} for coordinates
# use Vector{NTuple{3, Int} as the type of the vector of atoms coordinates
julia> intcoords = Vector{NTuple{3, Int}}(undef, natoms);

# I'm putting the vector initialization apart here, to show that
# it's possible to change the atoms coordinates if needed
# (although you can't mutate individual coordinates; you have to replace
# the whole n-tuple at once like it's done here. But it's very fast)
julia> for i in 1:natoms
           intcoords[i] = ntuple(i->rand(1:10), 3)
       end
julia> intcoords
10-element Vector{Tuple{Int64, Int64, Int64}}:
 (4, 3, 1)
 (5, 2, 6)
 (8, 4, 2)
 (7, 2, 4)
 (8, 9, 10)
 (4, 1, 3)
 (9, 5, 3)
 (7, 1, 2)
 (3, 6, 1)
 (5, 5, 2)

#  a mock-up for the main algorithm
julia> function foo(intcoords)
           natoms = length(intcoords)
           count = 0
           for i in 2:natoms
               # count the number of times the condition happens to be satisfied
               if all(abs.(intcoords[i] .- intcoords[i-1]) .< 5)
                   count += 1
               end
           end
           count
       end
foo (generic function with 1 method)

# the loop does not allocate
julia> using BenchmarkTools
julia> @btime foo($intcoords)
  12.257 ns (0 allocations: 0 bytes)
4

uliano · June 16, 2022, 2:27pm

Interesting suggestion, I had (briefly considered) to use SVectors but I struggled (my fault I’m learning and some concepts or habits are still to come), a vestigial remain of this effort can be seen in a couple of lines sizesmall = SA[magicints[…

In this process I learned that I don’t really need StaticArrays (as I’m not using linear algebra) so Tuples seems just right

I will reconsider and make an effort to refactor in this direction

(The main motivation for mutable arrays is that old habits are hard to overcome)

Topic		Replies	Views
I don't undestand allocations New to Julia	16	636	June 15, 2022
Common allocation mistakes Performance memory-allocation	47	7161	August 21, 2023
Avoiding allocations of small but non-trivial arrays (work array alternative?) Performance question	38	4487	November 17, 2022
Local arrays in for loops New to Julia	55	1826	March 3, 2021
Remove allocations from splatting? General Usage question , memory-allocation	18	572	July 19, 2023

Vector notation, intermediate value and allocations

Related topics