Marching cubes without extra allocations


#1

Hi,

I’ve been trying to implement a version of the famous marching cubes algorithm and have arrived at the following abomination:

function marching_cubes(values::Array{T,3},points::Array{Point3D{T},3},iso,cube_size=1) where T
  vertices     = Array{Point3f0,1}()
  indices      = Array{Tuple{Int,Int,Int},1}()
  vertex_list  = Array{Point3f0,1}(12)
  ind_pushed   = false
  cube_indices = [(0,0,0),(cube_size,0,0),(cube_size,cube_size,0),(0,cube_size,0),(0,0,cube_size),(cube_size,0,cube_size),(cube_size,cube_size,cube_size),(0,cube_size,cube_size)]
  edge_table,tri_table = get_edge_tri_table()
  i_max,j_max,k_max    = size(points)
  for k=1:cube_size:k_max-1
    for j=1:cube_size:j_max-1
      for i=1:cube_size:i_max-1
        
        cube_index = zero(Int32)
        for n = 1:8
          if values[i+cube_indices[n][1],j+cube_indices[n][2],k+cube_indices[n][3]]<iso
            cube_index |= Int32(2^(n-1))
          end
        end
        cube_index += Int32(1)
        edge_entry  = edge_table[cube_index]
        
        if edge_entry == 0
          continue
        end
        if edge_entry & 1 == 1
          vertex_list[1] = vertex_interp(iso,points[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]],points[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]],values[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]],values[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]])
        end
        if edge_entry & 2 == 2
          vertex_list[2] = vertex_interp(iso,points[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]],points[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]],values[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]],values[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]])
        end
        if edge_entry & 4 == 4
          vertex_list[3] = vertex_interp(iso,points[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]],points[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]],values[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]],values[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]])
        end
        if edge_entry & 8 == 8
          vertex_list[4] = vertex_interp(iso,points[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]],points[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]],values[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]],values[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]])
        end
        if edge_entry & 16 == 16
          vertex_list[5] = vertex_interp(iso,points[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]],points[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]],values[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]],values[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]])
        end
        if edge_entry & 32 == 32
          vertex_list[6] = vertex_interp(iso,points[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]],points[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]],values[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]],values[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]])
        end
        if edge_entry & 64 == 64
          vertex_list[7] = vertex_interp(iso,points[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]],points[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]],values[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]],values[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]])
        end
        if edge_entry & 128 == 128
          vertex_list[8] = vertex_interp(iso,points[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]],points[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]],values[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]],values[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]])
        end
        if edge_entry & 256 == 256
          vertex_list[9] = vertex_interp(iso,points[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]],points[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]],values[i+cube_indices[1][1],j+cube_indices[1][2],k+cube_indices[1][3]],values[i+cube_indices[5][1],j+cube_indices[5][2],k+cube_indices[5][3]])
        end
        if edge_entry & 512 == 512
          vertex_list[10] = vertex_interp(iso,points[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]],points[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]],values[i+cube_indices[2][1],j+cube_indices[2][2],k+cube_indices[2][3]],values[i+cube_indices[6][1],j+cube_indices[6][2],k+cube_indices[6][3]])
        end
        if edge_entry & 1024 == 1024
          vertex_list[11] = vertex_interp(iso,points[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]],points[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]],values[i+cube_indices[3][1],j+cube_indices[3][2],k+cube_indices[3][3]],values[i+cube_indices[7][1],j+cube_indices[7][2],k+cube_indices[7][3]])
        end
        if edge_entry & 2048 == 2048
          vertex_list[12] = vertex_interp(iso,points[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]],points[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]],values[i+cube_indices[4][1],j+cube_indices[4][2],k+cube_indices[4][3]],values[i+cube_indices[8][1],j+cube_indices[8][2],k+cube_indices[8][3]])
        end
        
        table_entry = tri_table[cube_index]
        n = 1
        while table_entry[n]!=-1
          vert = vertex_list[table_entry[n]+1]
          push!(vertices,vert)
          if !ind_pushed
            push!(indices,(i,j,k))
            ind_pushed = true
          end
          n+=1
        end
        ind_pushed = false        
      end
    end
  end
  return vertices,indices
end

This results in no real unwanted allocations through each loop, having @time output of
0.026461 seconds (720 allocations: 88.844 KiB)

However as you can see this code looks awful and I was wondering how to do this more cleanly.
I’ve tried a lot of things which didn’t work. Defining arrays t_val, t_p through selection of the correct elements resulted in allocations of the order of 300mb, defining array views resulted in allocations of 100mb and slow execution, defining the correct values of indices in each loop resulted in 200mb allocations.

I guess what I’m trying to understand is how to not allocate any extra arrays or values inside the loop, but be able to define what part of the array to use inside the loop.

If there is a totally different way of doing this that leads to more readable code and similar performance please let me know i’m very keen to learn how to utilize all the tools in julia!

cheers

EDIT: Upon request, a working code via pastebin:
https://pastebin.com/0sKpb643

Only thing that changed is Point3D -> Point3f0 (Point3f0 is basically static array of 3 Float32’s) . Both have same functionality but one is custom the other isn’t.

Minimal requirement: GeometryTypes
Visualization requirement: GLVisualize,ColorTypes


#2

I would suggest first trying to rewrite the code to operate on views rather than slices, which might help you modularize the algorithm. Constructing a view() will allocate less memory than copying out a slice of an array, although it still allocates some. Fortunately, I think there’s some cool work underway to make non-allocating views possible in Julia v0.7.

Until then, if you find that your algorithm is cleaner with views, and if you want to remove the small memory allocation that each view creates, it’s possible to make an unsafe view-like object that allocates no memory at all. One example is here: https://github.com/rdeits/NNLS.jl/blob/b7314fb9691a9d4ec9897316c8732f9aba94ed47/src/NNLS.jl#L200 (based on earlier work by @tkoolen and @jrevels). This object is “unsafe” because if you keep the UnsafeVectorView around after its parent goes out of scope, then you may end up with a view of junk data. But it’s otherwise ideal for constructing lots of extremely cheap views within the inner loop of an algorithm. For example, I use these unsafe views in NNLS.jl here: https://github.com/rdeits/NNLS.jl/blob/b7314fb9691a9d4ec9897316c8732f9aba94ed47/src/NNLS.jl#L342 to efficiently grab views of my A matrix to pass to the construct_householder! function which expects an AbstractVector. This means that construct_householder! doesn’t need to know anything about the indices of A, but I also have no memory allocation penalty at all.


Array views becoming dominant source of memory allocation
#3

Thank you so much for your answer! I tried it with views, which does result in marginally cleaner code (still kind of annoying since the way indexing over the cube works needs me to define these cube_indices), but this still resulted in some 100mb being allocated in total, and more importantly code that runs 10x slower than the solution I have now.

I will experiment with the unsafe things, I was wondering if julia allows for this kind of control similar to pointer stuff in C or C++, and it seems that it might. Thank you for the examples and explanation!


#4

You could replace your if cavalcade with some inlined function calls, or with a single @update_vertex_list ... macro expansion. Where do the numbers in each if come from?


#5

These are from the way the original algorithm was implemented in:
http://paulbourke.net/geometry/polygonise/
TL;DR
There are only so many possible combinations that result in a surface through a cube -> depending on which we construct a 12-bit number. Then we have two lookup tables telling us what edges to form a vertex on (12 possible) and which vertices to combine in triangles. That’s where these numbers come from, it’s pretty specific, but maybe someone knows a cleaner way to do it. I didn’t bother and just 1-1 translated the algorithm.

A macro would clean things up though.


#6

This is how I think:

“This could be an interesting thing to look at. Oh, what are those Point3D structs. Oh, there are no input arguments to the function, will I have to create those myself? Nah, this will take to long,”.

and then I click back in the browser.

If you give code that can easily be run so people can easily experiment, you will usually get a lot more help than if you force people to run the code through their head and guess what your custom types are.


#7

Since I don’t think it’s handy to paste this amount of code in discourse I made a pastebin:
https://pastebin.com/0sKpb643

Only thing that changed is Point3D-> Point3f0 (Point3f0 is basically static array of 3 Float32’s) . Both have same functionality but one is custom the other isn’t.

Minimal requirement: GeometryTypes
Visualization requirement: GLVisualize,ColorTypes

I’ll also edit my first post with the pastebin link.

I hope this makes it more easy to play with.

Cheers


#8

ICYMI: Meshing.jl has an implementation of marching cubes:

Does that meet your need?


#9

Yes, in another discussion someone brought up this package that I didn’t know. This algorithm is very significantly faster than mine, generating only around 1.35mib for a signed distance field of 91x91x91.
I wil certainly base my algorithm on this one. There is however one weird thing in that it creates a mesh (looked at it with renderdoc) with a lot of rogue vertices that are added doubly etc, so i’m not sure what’s going on with that.

But thanks a lot for the mention!