Performance of recursive function

Fabrice_Rosay · July 8, 2022, 5:19pm

Hi,
I have the following recursive function to calculate the number of positions in a game.
All functions inside perft (play, gen_moves, isOver) are non allocating, yet perft allocates a lot of memory.
Using the track-allocation function the only line that is allocating is the recursive call inside perft (the line in the for loop).
Is there anything I can do to avoid this or is it an artefact of the way allocation are tracked ?
(for info this function is 2 times slower than an equivalent one in C++ and I try to narrow the gap)

function perft(pos,depth,moves)
	nodes=0
	 if isOver(pos)[1]
		return 0
	end
	  if depth==1
		return count_moves(pos)
	end


	nmoves=gen_moves(pos,moves[depth])
	for k in 1:nmoves
		 nodes+=perft(play(pos,moves[depth][k]),depth-1,moves)#@timeit to "nested"
	end
	return nodes
end

jling · July 8, 2022, 5:37pm

can you post complete and benchmark-able code?

Fabrice_Rosay · July 8, 2022, 5:40pm

it is a quite large module 300 lines:


import Base.:<<,Base.:>>,Base.:~,Base.:&,Base.:⊻,Base.:|,Base.copy
export Game,
    canPlay,
    play,
    undo,
    isOver,
    score,
    getplayer,
    getboard,
    getround,
    gethash,
    Move,
    gen_moves


const N=7
const NN=N*N



struct Square
	sq::Int
end

is_pass(sq::Square)=sq.sq==-1

struct Bitboard
	data::UInt64
end

Base.:<<(b::Bitboard,n)=Bitboard(b.data<<n)
Base.:>>(b::Bitboard,n)=Bitboard(b.data>>n)
Base.:~(b::Bitboard)=Bitboard(~b.data & 0x7f7f7f7f7f7f7f)
Base.:⊻(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data ⊻ b2.data)
Base.:&(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data & b2.data)
Base.:|(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data | b2.data)

Base.iterate(b::Bitboard)=(Square(trailing_zeros(b.data)),b.data&(b.data-0x1))
Base.iterate(b::Bitboard,state)=state==0x0 ? nothing : (Square(trailing_zeros(state)),state &(state-0x1))


function set(b::Bitboard,sq::Square)
	Bitboard(b.data|UInt64(1)<<sq.sq)
end

function get(b::Bitboard,sq::Square)
	b.data>>sq.sq & 1
end

Bitboard(sq::Square)=Bitboard(UInt64(1)<<sq.sq)
function singles(b::Bitboard)
	Bitboard((b.data << 1 | b.data << 9 | b.data >> 7 | b.data << 8 | b.data >> 8 |
	b.data >> 1 | b.data >> 9 | b.data << 7) & 0x7f7f7f7f7f7f7f)
end

function singles(sq::Square)
	b=Bitboard(UInt64(1)<<sq.sq)
	return singles(b)
end

function doubles(b::Bitboard)
	Bitboard(((b.data << 2 | b.data << 10 | b.data << 18 | b.data >> 6 | b.data >> 14 | b.data << 17 | b.data >> 15) & 0x7e7e7e7e7e7e7e) |

				((b.data << 16 | b.data >> 16) & 0x7f7f7f7f7f7f7f) |

				((b.data >> 2 | b.data >> 10 | b.data >> 18 | b.data << 6 | b.data << 14 | b.data << 15 | b.data >> 17) & 0x3f3f3f3f3f3f3f)
			 )
end

function doubles(sq::Square)
	b=Bitboard(UInt64(1)<<sq.sq)
	return doubles(b)
end


is_empty(b::Bitboard)=b.data==0
is_full(b::Bitboard)=b.data==0x7f7f7f7f7f7f7f
function test()
	b=Bitboard(0x7f7f7f7f7f7f7f)
	for x in b
		println(doubles(x))
	end
end

struct Board
	bplayer::Bitboard
	bopp::Bitboard
end

mutable struct Game
    board::Board
    player::Int8
	pass::Int8
    round::Int
end

struct Move
    from::Square
    to::Square
end

function Game()
	bb=Bitboard(0)
	bb=set(bb,Square(0))
	bb=set(bb,Square(54))
	bw=Bitboard(0)
	bw=set(bw,Square(6))
	bw=set(bw,Square(48))

    return Game(
    Board(bb,bw),
    1,
	0,
    0
    )
end

score(pos,n=0) = abs(sum(pos.board[:,:,1])-sum(pos.board[:,:,2]))
getplayer(pos) = pos.player


function getboard(pos)
    answer = zeros(Int8, N, N, 3, 1)
    answer[:,:,1:2,1].=pos.board
 	answer[:, :, 3, 1] .= pos.player
	# if pos.player==1
	# 	answer[:,:,1,1].=(pos.board[:,:,1,1].-pos.board[:,:,2,1])
	# else
	# 	answer[:,:,1,1].=rot180(pos.board[:,:,2,1].-pos.board[:,:,1,1])
	# end
    return answer
end


gethash(pos) =(deepcopy(pos.board),pos.player,pos.round)#pos.hash
getround(pos) = pos.round

other(x)=-x
index(x)=x==1 ? 1 : 2

function isOver(pos)
	bplayer=pos.board.bplayer
	bopp=pos.board.bopp
	p=count_ones(bplayer.data)
	o=count_ones(bopp.data)
	if is_empty(bplayer) || is_empty(bopp)
    	return true,(p-o)*pos.player
	end
	if is_full(bplayer|bopp)
		return true,sign((p-o)*pos.player)
	end
	if pos.pass==2
		return true,sign((p-o)*pos.player)
	end
	if pos.round>=200
		return true,sign((p-o)*pos.player)
	end
	return false,0
end

@inbounds @inline function play(game, move::Move)
    sqf=move.from
	sqt=move.to
	bplayer=game.board.bplayer
	bopp=game.board.bopp
	if is_pass(sqf)
		return Game(Board(bopp,bplayer),other(game.player),game.pass+1,game.round+1)
	end
	if sqf!=sqt
    	bplayer⊻=Bitboard(sqf)
	end
    bplayer=set(bplayer,sqt)
	turned=singles(sqt) & bopp
	bopp⊻=turned
	bplayer|=turned
	return Game(Board(bopp,bplayer),other(game.player),0,game.round+1)
end


function gen_moves(position,answer)
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		#push!(answer,Move(sq,sq))
		cpt+=1
		answer[cpt]=Move(sq,sq)
	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			#push!(answer,Move(sqf,sqt))
			cpt+=1
			answer[cpt]=Move(sqf,sqt)
		end
	end
	if cpt==0
		cpt+=1
		#push!(answer,Move(Square(-1),Square(-1)))
		answer[cpt]=Move(Square(-1),Square(-1))
	end
    cpt
end

function gen_move(position,k)
    move=Move(Square(-1),Square(-1))
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		move=Move(sq,sq)
		cpt+=1
		if cpt==k
			return move
		end

	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			move=Move(sqf,sqt)
			cpt+=1
			if cpt==k
				return move
			end
		end
	end
end

function count_moves(position)
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		cpt+=1
	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			cpt+=1
		end
	end
	if cpt==0
		push!(answer,Move(Square(-1),Square(-1)))
	end
    cpt
end

function perft(pos,depth,moves)
	nodes=0
	 if isOver(pos)[1]
		return 0
	end
	  if depth==1
		return count_moves(pos)
	end


	nmoves=gen_moves(pos,moves[depth])
	for k in 1:nmoves
		 nodes+=perft(play(pos,moves[depth][k]),depth-1,moves)#@timeit to "nested"
	end
	return nodes
end

function perf(pos,depth)

	 moves=[Vector{Move}(undef,200) for k in 1:depth]
	t=time()
	 nodes=perft(pos,depth,moves)

	t=time()-t
	MN=round(nodes/(10^6*t),digits=3)
	println("nodes: $nodes, speed: $MN Mn/s")

end
end

using ..GAME

function main()
	game=GAME.Game()
	GAME.perf(game,6)
	GAME.perf(game,6)
end

main()

lawless-m · July 8, 2022, 5:44pm

Julia doesn’t do tail call optimization and this is a situation that is unlikely to change.

github.com/JuliaLang/julia

tail call elimination

opened 01:36AM - 28 Nov 13 UTC

closed 11:43PM - 11 Jul 16 UTC

gitfoxi

speculative

It's interesting that this is valid syntax in Julia and Lua: ``` jl function re…c() return rec() end ``` The difference is that in Lua, when you call `rec()` it will stare at you and engage your motherboard's built-in space heater until you ctrl-C out. In Julia: ``` jl julia> rec() ERROR: stack overflow in rec at none:2 (repeats 80000 times) ``` So the stack 80000 things deep. That's interesting. Why does this matter? I'm not sure. But some people care a lot: http://www.lua.org/pil/6.3.html

Recursive code is best re-written as loops or perhaps channels, depending on the use case.

Some have tried via macros

but that is a few years old

stevengj · July 8, 2022, 7:07pm

Irrelevant because the recursive call here is not in tail position.

Tail-call optimization is practically irrelevant to recursion in an imperative language like Julia with first-class loops (i.e. not lisp) — you would only use recursion for cases that can’t be trivially written as loops, i.e. recursion is normally not used for tail calls.

lawless-m · July 8, 2022, 7:56pm

my point being that recursion is a non-optimal path

Fabrice_Rosay · July 8, 2022, 9:09pm

I rewrote the function to fixed depth with only for loops:
-it is slighly faster
-allocations do not disappear
-though execution time varies a lot.

btime GAME.perf_loop($game,4,$moves)
  7.826 ms (162661 allocations: 7.45 MiB)
 ────────────────────────────────────────────────────────────────────
                            Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      5.01s /   0.2%           4.15GiB /   0.2%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
 1              1   8.40ms  100.0%  8.40ms   7.45MiB  100.0%  7.45MiB
   2           16   8.39ms   99.9%   525μs   7.44MiB  100.0%   476KiB
     3        256   8.33ms   99.2%  32.5μs   7.43MiB   99.8%  29.7KiB
       4    6.46k   7.03ms   83.7%  1.09μs   7.14MiB   95.8%  1.13KiB
 ───────────────────────────────────────────────────────────────────

1 is te most outer loop and 4 the most inner.
It seems that the function play is doing one allocation … I had changed Game struct to mutable, when it was intended not to be. Now almost all allocations disappeared.
Sorry for your time

stevengj · July 9, 2022, 7:13am

Recursion is perfectly fine in performance-sensitive code. In any language (not just Julia), the trick is simply to enlarge the base case (so that the recursion overhead is amortized), except in the trivial TCO case that can just be transformed to a loop. See also Recursive call vs while loop - #18 by stevengj

stevengj · July 9, 2022, 7:21am

You have push! in your inner loop, so you would expect to have allocations.

Fabrice_Rosay · July 9, 2022, 7:53am

The array where push! happens is preallocated hence no allocation is made.
For the record and to second Performance of recursive function - #8 by stevengj, after correcting the mutable struct “bug” and doing a small tweak to counting function, the julia library is now slightly faster than C++ couterpart(libataxx for those interested).
TimerOutputs.jl was of great help, for optimizing the code.

jules · July 9, 2022, 10:42am

If you preallocate an array and push! to it, you will allocate new memory, because push! adds a new element outside of the preallocated range. However, you can preallocate additional memory outside of the current size of the array using sizehint!, which will then cause push! not to allocate until that memory is exhausted. Julia already does that by itself so growing an array by repeatedly push!ing is faster. If you know the maximum size of the array after all push!es you can also do something like

v = zeros(10_000)
resize!(v, 0)

And I think resize! leaves the backing memory intact, which you can then push! into again.

stevengj · July 10, 2022, 7:43am

This is what sizehint! is for.

jules · July 10, 2022, 7:58am

I meant to suggest this more for related algorithms where you start with some Vector of a certain size and reduce and increase its size during an algorithm without incurring additional allocations.

Topic		Replies	Views
For loop performance vs "functional" performance Performance advent-of-code	17	504	January 4, 2025
Improving for loop speed in recursive calculation Performance	1	175	March 10, 2023
Alllocations in a reduction over a Tuple Performance memory-allocation	2	450	April 28, 2022
Trampolines Performance recursion	6	1020	November 13, 2021
Too many allocations when indexing with slices Performance indexing , memory-allocation	16	2745	August 17, 2018

Performance of recursive function

Related topics