Very slow execution time in comparison even to Python

I am new to Julia.

While choosing a programming language for our next project wrote a short program in Julia:

using LinearAlgebra
using Statistics
using Random

import Base: +, -

struct Point
    x::Float64
    y::Float64
    z::Float64
end

+(a::Point, b::Point) = Point(a.x + b.x, a.y + b.y, a.z + b.z)
-(a::Point, b::Point) = Point(a.x - b.x, a.y - b.y, a.z - b.z)

struct Plane
    point::Point
    normal::Array{Float64}
end

ArrayToMatrix(array) = vcat(array...)

function mean_center(points)
    centroid = Point(mean(ArrayToMatrix([[p.x p.y p.z] for p in points]), dims = 1)...)
    points_centered = [p - centroid for p in points]
    centroid, points_centered
end

function best_fit_plane(points)
    centroid, centered_points = mean_center(points)
    m = ArrayToMatrix([[p.x p.y p.z] for p in centered_points])
    u, _, _ = svd(transpose(m))
    Plane(centroid, u[:,end])
end

function main()
  MIN = -2000
  MAX = 2000
  BATCHES_COUNT = 1000000
  POINTS_PER_BATCH = 8

  generatePoint() = map(x -> x + MIN, rand!(zeros(3)) * (MAX - MIN))
  generateBatch() = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
  generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))

  for b in generateBatches()
    points = [Point(coord...) for coord in b]
    println(best_fit_plane(points))
  end
end

main()

running this code with following command:

time julia -O 3 -t auto -p auto bench.jl

and get on my mac book:

julia -p auto -t auto -O 3 bench.jl  210,16s user 112,23s system 30% cpu 17:43,32 total

Almost 18 mins. The same code in Python + skspatial finishes in 11 mins average. Haskell ~ 1 min.

What I made the wrong way?

Welcome!

The array is an abstract type here because the dimension is not given.
The following should be much more performant:

normal::Array{Float64, 1}
normal::Vector{Float64} # equivalent to the one above

2 Likes

There are many things to improve here. In addition to the abstract type in Plane, I believe the main issue is the repeated (and very slow) conversions of Points into matrix. You must find a way to not do that.

I’ll have a look later, unless someone beats me to it.

One quick thing to improve performance (and readability) is to define also

Base.:/(a::Point, s::Number) = Point(a.x/s, a.y/s, a.z/s)

and then write

function mean_center(points)
    centroid = mean(points)
    points_centered = [p - centroid for p in points]
    centroid, points_centered
end
2 Likes

A little bit faster would be

m = [getproperty(p, c) for p in centered_points, c in (:x, :y, :z)]
1 Like

A bit less elegant perhaps but this already gives you a x2 speed-up

function mean_center2(points,centroid)
    for p in points
        centroid += [p.x; p.y; p.z]
    end
    centroid = Point(centroid./length(points)...)
    points_centered = [p - centroid for p in points]
    centroid, points_centered
end

function best_fit_plane2(points,centroid,m)
    centroid, centered_points = mean_center2(points,centroid)
    @inbounds for p ∈ 1:8
        m[p,1],m[p,2],m[p,3] = centered_points[p].x,centered_points[p].y, centered_points[p].z
    end    
    u, _, _ = svd(transpose(m))
    Plane(centroid, u[:,end])
end

function main()
  MIN = -2000
  MAX = 2000
  BATCHES_COUNT = 10000
  POINTS_PER_BATCH = 8

  generatePoint() = map(x -> x + MIN, rand(3) * (MAX - MIN))
  generateBatch() = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
  generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))

  for b in generateBatches()
    points = [Point(coord...) for coord in b]
    # println(best_fit_plane(points))
    best_fit_plane(points)
  end
end

function main2()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 10000
    POINTS_PER_BATCH = 8
    m           = fill(0.0,POINTS_PER_BATCH,3)
    centroid    = fill(0.0,3)

    generatePoint()   = Point( rand(3) * (MAX - MIN) .+ MIN...)
    generateBatch()   = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
    generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))
  
    for b in generateBatches()
      best_fit_plane2(b,centroid,m)
    end
end

@benchmark main()
BenchmarkTools.Trial: 
  memory estimate:  107.57 MiB
  allocs estimate:  1149496
  --------------
  minimum time:     196.844 ms (9.15% GC)
  median time:      216.148 ms (12.00% GC)
  mean time:        269.495 ms (23.74% GC)
  maximum time:     955.288 ms (44.72% GC)
  --------------
  samples:          19
  evals/sample:     1

@benchmark main2()
BenchmarkTools.Trial: 
  memory estimate:  89.10 MiB
  allocs estimate:  949496
  --------------
  minimum time:     136.543 ms (3.96% GC)
  median time:      138.251 ms (4.29% GC)
  mean time:        139.093 ms (4.30% GC)
  maximum time:     154.269 ms (3.36% GC)
  --------------
  samples:          36
  evals/sample:     1
1 Like

Do you really want this thing to print all the time? It’s going to spend its time printing mostly.

1 Like

Here there is not so much performance gain to be expected, but for readability I would prefer

generatePoint(min, max) = Point(rand() * (max - min) + min, rand() * (max - min) + min, rand() * (max - min) + min) # or a bit less performant but shorter Point((rand(3)*(max - min) .+ min)...)
generateBatch(points_per_batch) = [generatePoint() for _ in 1:points_per_batch]

It seems printing to a terminal in Julia is pretty slow compared to Python, so that will be noticeable for lots of output to a terminal:

paulm@cmstorm 09:25:/data/examples/julia$ cat print_integers.py 
for i in range(1, 1000001):
    print(i)

paulm@cmstorm 09:25:/data/examples/julia$ cat print_integers.jl 
for i in 1 : 1000000
    println(i)
end

paulm@cmstorm 09:25:/data/examples/julia$ time python print_integers.py
...
999998
999999
1000000

real	0m2.440s
user	0m1.196s
sys	0m1.093s

paulm@cmstorm 09:25:/data/examples/julia$ time julia -O3 print_integers.jl
...
real	0m5.790s
user	0m2.909s
sys	0m2.863s

paulm@cmstorm 09:26:/data/examples/julia$ time python print_integers.py > /dev/null

real	0m0.277s
user	0m0.274s
sys	0m0.003s

paulm@cmstorm 09:27:/data/examples/julia$ time julia -O3 print_integers.jl > /dev/null

real	0m0.375s
user	0m0.289s
sys	0m0.071s

I think there was a github issue for this as well. Ah, it’s https://github.com/JuliaLang/julia/issues/36639, but that doesn’t provide much information yet, other than a post on this forum.

I would not use my own definition of Point. Instead I would just use StaticArrays. You can roll your own, like you have, but then you should add some more functionality.

You should absolutely avoid the super-expensive conversion from points to matrix, and I would not generate the whole set of batches at once, just generate one at a time.

Here’s an example implementation using SVector:


using StaticArrays
using Statistics: mean
using LinearAlgebra: svd


struct Plane
    point::SVector{3, Float64}
    normal::SVector{3, Float64}
end

points2matrix(points) = reshape(reinterpret(Float64, points), 3, :)

function mean_center(points)
    centroid = mean(points)
    points_centered = points .- (centroid,)
    return centroid, points_centered
end

function best_fit_plane(points)
    centroid, centered_points = mean_center(points)
    M = points2matrix(centered_points)
    u, = svd(M)
    return Plane(centroid, u[:,end])
end


function main()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 1000000
    POINTS_PER_BATCH = 8
  
    generatePoint() = rand(SVector{3, Float64}) .* (MAX - MIN) .+ MIN
    
    generateBatch() = [generatePoint() for _ in 1:POINTS_PER_BATCH]
    generateBatches() = (generateBatch() for _ in 1:BATCHES_COUNT)
    
    planes = [best_fit_plane(batch) for batch in generateBatches()]
    return planes
end

While benchmarking I suggest reducing the BATCHES_COUNT which is needlessly large for benchmarking.

Notice the definition of generateBatches. It has () instead of []. This is a generator, so each batch is generated on demand.

Runtime for the above code is ~6.5 seconds. It doesn’t print though, so you will need to add that if you like.

5 Likes

Here’s a multithreaded version that scales pretty well (3.5x faster with 4 threads in my laptop):

function main_threaded()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 1000000
    POINTS_PER_BATCH = 8
  
    generatePoint() = rand(SVector{3, Float64}) .* (MAX - MIN) .+ MIN
    generateBatch() = [generatePoint() for _ in 1:POINTS_PER_BATCH]
    
    planes = Vector{Plane}(undef, BATCHES_COUNT)
    Threads.@threads for i in 1:BATCHES_COUNT
        planes[i] = best_fit_plane(generateBatch())
    end
    return planes
end
5 Likes

I have a hard time believing that it’s useful to print 1 million calculated Planes objects to screen. Printing to file, ok, but I don’t think this is particularly useful, and it messes up benchmarking.

3 Likes

At least one should avoid printing inside the hot loop.

7 Likes

See my post in Why is printing to a terminal slow? just yet, e.g. piping through less is also very slow

2 Likes

If the objective is to just speed up the posted script, then I think the conversation is going just fine. But if an objective is to compare versions of the script in different languages, then I think the Python and Haskell code should also be posted. I myself won’t be able to make language comparisons, but I’m sure someone else can.
Besides, they may spot performance differences that are happening because code that looks equivalent actually isn’t. A (probably unrelated) example is how array-slicing bracket-syntax makes copies in Julia but views in Python.

2 Likes

A note: In my posted example, roughly 80%-90% of the time is now spent inside svd. If you want more speed-ups, you should consider whether you can do better than the standard svd.

4 Likes
using LinearAlgebra, Statistics, Random, StaticArrays

struct Plane
    point::SVector{3,Float64}
    normal::SVector{3,Float64}
end

function mean_center(points::SMatrix)
    centroid = mean(points,dims=2)
    points_centered = points .- centroid
    centroid, points_centered
end

function best_fit_plane(points::SMatrix)
    centroid, centered_points = mean_center(points)
    u, = svd(centered_points) # you can leave _, _ if you are interested in the first output only
    Plane(centroid, u[:,end])
end

function main()
	MIN = -2000
	MAX = 2000
	BATCHES_COUNT = 10000
	POINTS_PER_BATCH = 8
	
	generateBatch() = MIN .+ (MAX - MIN)*rand(SMatrix{3,POINTS_PER_BATCH,Float64})

	out = Vector{Plane}(undef,BATCHES_COUNT) # initialize output
	for i in eachindex(out)
		b = generateBatch()
		out[i] = best_fit_plane(b)
	end
	return out
end

I tried speeding up the code by storing the batch data in a SMatrix from StaticArrays but it seems that they use the same SVD as in julia base. So no significant speed up. The generation of the batch data is slightly faster though.

@Yury_Kotlyarov if you are working with computational geometry, consider Meshes.jl. It should be a performant api for geometrical objects and meshes. Your Point type is available there for example, as well as many other types.

For data with only 3 dimensions I think it will be hard to make it faster, otherwise truncated svd or RandomizedLinAlg might be worth a try. Using MKL and Float32 instead of Float64 can speed up svd quite a bit.

3 Likes

Since we are only interested in one part of the SVD, i.e. U, we can calculate the SVD of MM’ like shown here. The SVDs of MM’ and M share U.

function best_fit_plane(points::SMatrix)
    centroid, centered_points = mean_center(points)
	m = centered_points*transpose(centered_points)
    u, = svd(m) # you can leave _, _ if you are interested in the first output only
    Plane(centroid, u[:,end])
end

The gain is noticeably, but should be even higher if we increase POINTS_PER_BATCH.

function best_fit_plane(points::SMatrix)
    centroid, centered_points = mean_center(points)
	m = centered_points*transpose(centered_points)
    u, = svd(m) # you can leave _, _ if you are interested in the first output only
    Plane(centroid, u[:,end])
end

One needs to check that I have made no errors in the implementation or my train of thought, but with this trick I can bring down the time of the main loop from 15s to 4s using

BATCHES_COUNT = 250000
POINTS_PER_BATCH = 32
3 Likes