Performance of recursive function

Hi,
I have the following recursive function to calculate the number of positions in a game.
All functions inside perft (play, gen_moves, isOver) are non allocating, yet perft allocates a lot of memory.
Using the track-allocation function the only line that is allocating is the recursive call inside perft (the line in the for loop).
Is there anything I can do to avoid this or is it an artefact of the way allocation are tracked ?
(for info this function is 2 times slower than an equivalent one in C++ and I try to narrow the gap)

function perft(pos,depth,moves)
	nodes=0
	 if isOver(pos)[1]
		return 0
	end
	  if depth==1
		return count_moves(pos)
	end


	nmoves=gen_moves(pos,moves[depth])
	for k in 1:nmoves
		 nodes+=perft(play(pos,moves[depth][k]),depth-1,moves)#@timeit to "nested"
	end
	return nodes
end

can you post complete and benchmark-able code?

it is a quite large module 300 lines:


import Base.:<<,Base.:>>,Base.:~,Base.:&,Base.:⊻,Base.:|,Base.copy
export Game,
    canPlay,
    play,
    undo,
    isOver,
    score,
    getplayer,
    getboard,
    getround,
    gethash,
    Move,
    gen_moves


const N=7
const NN=N*N



struct Square
	sq::Int
end

is_pass(sq::Square)=sq.sq==-1

struct Bitboard
	data::UInt64
end

Base.:<<(b::Bitboard,n)=Bitboard(b.data<<n)
Base.:>>(b::Bitboard,n)=Bitboard(b.data>>n)
Base.:~(b::Bitboard)=Bitboard(~b.data & 0x7f7f7f7f7f7f7f)
Base.:⊻(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data ⊻ b2.data)
Base.:&(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data & b2.data)
Base.:|(b1::Bitboard,b2::Bitboard)=Bitboard(b1.data | b2.data)

Base.iterate(b::Bitboard)=(Square(trailing_zeros(b.data)),b.data&(b.data-0x1))
Base.iterate(b::Bitboard,state)=state==0x0 ? nothing : (Square(trailing_zeros(state)),state &(state-0x1))


function set(b::Bitboard,sq::Square)
	Bitboard(b.data|UInt64(1)<<sq.sq)
end

function get(b::Bitboard,sq::Square)
	b.data>>sq.sq & 1
end

Bitboard(sq::Square)=Bitboard(UInt64(1)<<sq.sq)
function singles(b::Bitboard)
	Bitboard((b.data << 1 | b.data << 9 | b.data >> 7 | b.data << 8 | b.data >> 8 |
	b.data >> 1 | b.data >> 9 | b.data << 7) & 0x7f7f7f7f7f7f7f)
end

function singles(sq::Square)
	b=Bitboard(UInt64(1)<<sq.sq)
	return singles(b)
end

function doubles(b::Bitboard)
	Bitboard(((b.data << 2 | b.data << 10 | b.data << 18 | b.data >> 6 | b.data >> 14 | b.data << 17 | b.data >> 15) & 0x7e7e7e7e7e7e7e) |

				((b.data << 16 | b.data >> 16) & 0x7f7f7f7f7f7f7f) |

				((b.data >> 2 | b.data >> 10 | b.data >> 18 | b.data << 6 | b.data << 14 | b.data << 15 | b.data >> 17) & 0x3f3f3f3f3f3f3f)
			 )
end

function doubles(sq::Square)
	b=Bitboard(UInt64(1)<<sq.sq)
	return doubles(b)
end


is_empty(b::Bitboard)=b.data==0
is_full(b::Bitboard)=b.data==0x7f7f7f7f7f7f7f
function test()
	b=Bitboard(0x7f7f7f7f7f7f7f)
	for x in b
		println(doubles(x))
	end
end

struct Board
	bplayer::Bitboard
	bopp::Bitboard
end

mutable struct Game
    board::Board
    player::Int8
	pass::Int8
    round::Int
end

struct Move
    from::Square
    to::Square
end

function Game()
	bb=Bitboard(0)
	bb=set(bb,Square(0))
	bb=set(bb,Square(54))
	bw=Bitboard(0)
	bw=set(bw,Square(6))
	bw=set(bw,Square(48))

    return Game(
    Board(bb,bw),
    1,
	0,
    0
    )
end

score(pos,n=0) = abs(sum(pos.board[:,:,1])-sum(pos.board[:,:,2]))
getplayer(pos) = pos.player


function getboard(pos)
    answer = zeros(Int8, N, N, 3, 1)
    answer[:,:,1:2,1].=pos.board
 	answer[:, :, 3, 1] .= pos.player
	# if pos.player==1
	# 	answer[:,:,1,1].=(pos.board[:,:,1,1].-pos.board[:,:,2,1])
	# else
	# 	answer[:,:,1,1].=rot180(pos.board[:,:,2,1].-pos.board[:,:,1,1])
	# end
    return answer
end


gethash(pos) =(deepcopy(pos.board),pos.player,pos.round)#pos.hash
getround(pos) = pos.round

other(x)=-x
index(x)=x==1 ? 1 : 2

function isOver(pos)
	bplayer=pos.board.bplayer
	bopp=pos.board.bopp
	p=count_ones(bplayer.data)
	o=count_ones(bopp.data)
	if is_empty(bplayer) || is_empty(bopp)
    	return true,(p-o)*pos.player
	end
	if is_full(bplayer|bopp)
		return true,sign((p-o)*pos.player)
	end
	if pos.pass==2
		return true,sign((p-o)*pos.player)
	end
	if pos.round>=200
		return true,sign((p-o)*pos.player)
	end
	return false,0
end

@inbounds @inline function play(game, move::Move)
    sqf=move.from
	sqt=move.to
	bplayer=game.board.bplayer
	bopp=game.board.bopp
	if is_pass(sqf)
		return Game(Board(bopp,bplayer),other(game.player),game.pass+1,game.round+1)
	end
	if sqf!=sqt
    	bplayer⊻=Bitboard(sqf)
	end
    bplayer=set(bplayer,sqt)
	turned=singles(sqt) & bopp
	bopp⊻=turned
	bplayer|=turned
	return Game(Board(bopp,bplayer),other(game.player),0,game.round+1)
end


function gen_moves(position,answer)
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		#push!(answer,Move(sq,sq))
		cpt+=1
		answer[cpt]=Move(sq,sq)
	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			#push!(answer,Move(sqf,sqt))
			cpt+=1
			answer[cpt]=Move(sqf,sqt)
		end
	end
	if cpt==0
		cpt+=1
		#push!(answer,Move(Square(-1),Square(-1)))
		answer[cpt]=Move(Square(-1),Square(-1))
	end
    cpt
end

function gen_move(position,k)
    move=Move(Square(-1),Square(-1))
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		move=Move(sq,sq)
		cpt+=1
		if cpt==k
			return move
		end

	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			move=Move(sqf,sqt)
			cpt+=1
			if cpt==k
				return move
			end
		end
	end
end

function count_moves(position)
    cpt = 0
    for sq in singles(position.board.bplayer) & ~(position.board.bplayer | position.board.bopp)
		cpt+=1
	end
	for sqf in position.board.bplayer
		for sqt in doubles(sqf) & ~(position.board.bplayer | position.board.bopp)
			cpt+=1
		end
	end
	if cpt==0
		push!(answer,Move(Square(-1),Square(-1)))
	end
    cpt
end

function perft(pos,depth,moves)
	nodes=0
	 if isOver(pos)[1]
		return 0
	end
	  if depth==1
		return count_moves(pos)
	end


	nmoves=gen_moves(pos,moves[depth])
	for k in 1:nmoves
		 nodes+=perft(play(pos,moves[depth][k]),depth-1,moves)#@timeit to "nested"
	end
	return nodes
end

function perf(pos,depth)

	 moves=[Vector{Move}(undef,200) for k in 1:depth]
	t=time()
	 nodes=perft(pos,depth,moves)

	t=time()-t
	MN=round(nodes/(10^6*t),digits=3)
	println("nodes: $nodes, speed: $MN Mn/s")

end
end

using ..GAME

function main()
	game=GAME.Game()
	GAME.perf(game,6)
	GAME.perf(game,6)
end

main()

Julia doesn’t do tail call optimization and this is a situation that is unlikely to change.

https://github.com/JuliaLang/julia/issues/4964

Recursive code is best re-written as loops or perhaps channels, depending on the use case.

Some have tried via macros

but that is a few years old

Irrelevant because the recursive call here is not in tail position.

Tail-call optimization is practically irrelevant to recursion in an imperative language like Julia with first-class loops (i.e. not lisp) β€” you would only use recursion for cases that can’t be trivially written as loops, i.e. recursion is normally not used for tail calls.

2 Likes

my point being that recursion is a non-optimal path

I rewrote the function to fixed depth with only for loops:
-it is slighly faster
-allocations do not disappear
-though execution time varies a lot.

btime GAME.perf_loop($game,4,$moves)
  7.826 ms (162661 allocations: 7.45 MiB)
 ────────────────────────────────────────────────────────────────────
                            Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      5.01s /   0.2%           4.15GiB /   0.2%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
 1              1   8.40ms  100.0%  8.40ms   7.45MiB  100.0%  7.45MiB
   2           16   8.39ms   99.9%   525ΞΌs   7.44MiB  100.0%   476KiB
     3        256   8.33ms   99.2%  32.5ΞΌs   7.43MiB   99.8%  29.7KiB
       4    6.46k   7.03ms   83.7%  1.09ΞΌs   7.14MiB   95.8%  1.13KiB
 ───────────────────────────────────────────────────────────────────

1 is te most outer loop and 4 the most inner.
It seems that the function play is doing one allocation … I had changed Game struct to mutable, when it was intended not to be. Now almost all allocations disappeared.
Sorry for your time

1 Like

Recursion is perfectly fine in performance-sensitive code. In any language (not just Julia), the trick is simply to enlarge the base case (so that the recursion overhead is amortized), except in the trivial TCO case that can just be transformed to a loop. See also Recursive call vs while loop - #18 by stevengj

2 Likes

You have push! in your inner loop, so you would expect to have allocations.

The array where push! happens is preallocated hence no allocation is made.
For the record and to second Performance of recursive function - #8 by stevengj, after correcting the mutable struct β€œbug” and doing a small tweak to counting function, the julia library is now slightly faster than C++ couterpart(libataxx for those interested).
TimerOutputs.jl was of great help, for optimizing the code.

If you preallocate an array and push! to it, you will allocate new memory, because push! adds a new element outside of the preallocated range. However, you can preallocate additional memory outside of the current size of the array using sizehint!, which will then cause push! not to allocate until that memory is exhausted. Julia already does that by itself so growing an array by repeatedly push!ing is faster. If you know the maximum size of the array after all push!es you can also do something like

v = zeros(10_000)
resize!(v, 0)

And I think resize! leaves the backing memory intact, which you can then push! into again.

2 Likes

This is what sizehint! is for.

1 Like

I meant to suggest this more for related algorithms where you start with some Vector of a certain size and reduce and increase its size during an algorithm without incurring additional allocations.

1 Like