Performance of Julia as a general-purpose programming language


#1

Hi,
I used Julia for a “computing” project recently and it was really nice. Now, I’m tempted to use Julia for a more “classic” project (involving tree-like data-structures) so I’ve implemented a test project (the tictactoe game with a monte-carlo bot… :clown_face:) but my Julia code is 3x slower than my C++ code (see Julia code below and the github repo here). According to the profiler, a lot of time is spent to copy the game board so I’ve implemented a “mutable” version making less copies but it is not faster.
Is this a limitation of Julia or did I make mistakes in my code ?

# tictactoe_cmp.jl
const NB_GAMES = 1000

struct Tictactoe
  board::Vector{Int}
  nbFreeMoves::Int
  currentPlayer::Int
  nextPlayer::Int
  finished::Bool
  winner::Int
end

function Tictactoe() 
  board = [0, 0, 0, 0, 0, 0, 0, 0, 0]
  nbFreeMoves = 9
  currentPlayer = 1
  nextPlayer = 2
  finished = false
  winner = 0
  Tictactoe(board, nbFreeMoves, currentPlayer, nextPlayer, finished, winner)
end

# check if the game is finished and find the winner
function computeFinish(board::Vector{Int}, nbFreeMoves::Int)
  b11, b12, b13, b21, b22, b23, b31, b32, b33 = board
  if b11 != 0 && b11 == b12 == b13
    true, b11
  elseif b21 != 0 && b21 == b22 == b23
    true, b21
  elseif b31 != 0 && b31 == b32 == b33
    true, b31
  elseif b11 != 0 && b11 == b21 == b31
    true, b11
  elseif b12 != 0 && b12 == b22 == b32
    true, b12
  elseif b13 != 0 && b13 == b23 == b33
    true, b13
  elseif b11 != 0 && b11 == b22 == b33
    true, b11
  elseif b13 != 0 && b13 == b22 == b31
    true, b11
  elseif nbFreeMoves == 0  # no winner, no empty cell -> draw
    true, 0
  else  # no winner, empty cell -> game not finished
    false, 0
  end
end

# find the moveIndex'th free cell in the board
function findIndex(board::Vector{Int}, moveIndex::Int)
  nbFree = 0
  k = 1
  while k<=9 
    if board[k] == 0
      nbFree += 1
      if nbFree == moveIndex
        break
      end
    end
    k += 1
  end
  k
end

function playIndex(game::Tictactoe, moveIndex::Int)
  k = findIndex(game.board, moveIndex)
  if k <= 9
    board = copy(game.board)   # this line seems to take some time
    board[k] = game.currentPlayer
    nbFreeMoves = game.nbFreeMoves - 1
    currentPlayer = game.nextPlayer
    nextPlayer = game.currentPlayer
    finished, winner = computeFinish(board, nbFreeMoves)
    Tictactoe(board, nbFreeMoves, currentPlayer, nextPlayer, finished, winner)
  else
    game
  end
end

abstract type Bot end

function runGame(game::Tictactoe, bot1::Bot, bot2::Bot)
  while !game.finished
    bot = game.currentPlayer == 1 ? bot1 : bot2
    moveIndex = genmove(game, bot)
    game = playIndex(game, moveIndex)
  end
  game.winner
end

struct RandomBot <: Bot end

function genmove(game::Tictactoe, bot::RandomBot)
  nbMoves = game.nbFreeMoves
  rand(1:nbMoves)
end

struct McBot <: Bot
  maxSims::Int
end
# evaluate a move by running many random simulations (monte-carlo)
function evalMove(game::Tictactoe, moveIndex::Int, nbSims::Int)
  bot = RandomBot()
  player = game.currentPlayer
  game1 = playIndex(game, moveIndex)
  evalMove1(unused) = runGame(game1, bot, bot) == player ? 1 : 0
  mapreduce(evalMove1, (+), 0, 1:nbSims)
end

function genmove(game::Tictactoe, bot::McBot)
  nbMoves = game.nbFreeMoves
  nbMoveSims = div(bot.maxSims, nbMoves)
  simulate(moveIndex) = evalMove(game, moveIndex, nbMoveSims)
  wins = map(simulate, [moveIndex for moveIndex in 1:nbMoves])
  indmax(wins)
end

# run many games for reducing the variance of the results
function runGames(game::Tictactoe, bot1::Bot, bot2::Bot, nbGames::Int)
  res = [0, 0, 0]
  for _ in 1:nbGames
    winner = 1 + runGame(game, bot1, bot2)
    res[winner] += 1
  end
  res
end

# compare RandomBot and McBot for a given number of simulations
function runXp(nbSims::Int)
  game = Tictactoe()
  botA = RandomBot()
  botB = McBot(nbSims)
  resAB = runGames(game, botA, botB, NB_GAMES)
  resBA = runGames(game, botB, botA, NB_GAMES)
  println("RandomBot McBot ", nbSims, " ", NB_GAMES, " ", 
          resAB[1], " ", resAB[2], " ", resAB[3], " ", 
          resBA[1], " ", resBA[2], " ", resBA[3], " ", 
          resAB[1]+resBA[1], " ", resAB[2]+resBA[3], " ", resAB[3]+resBA[2])
end

println("botA botB nbSims nbGames drawsA1B2 winsA1 winsB2 drawsB1A2 winsB1 winsA2 draws winsA winsB")
vecNbSims = [1, 10, 100, 1000, 10000]
map(runXp, vecNbSims)

#2

This is really a lot of code to ask people to look over, so it’s hard to say anything with certainty.

However, one thing I notice is that your board vector is always 9 elements, so you might want to consider using an SVector (which is immutable) or an MVector (which is fixed-size but mutable) from https://github.com/JuliaArrays/StaticArrays.jl


#3

I think that a good rule of thumb to keep in mind is that probably any code can get <2x from C without too much of an issue, and sometimes to get to 1x with C it might take some work (at least until more compiler optimizations like constant propagation and stack-allocated views are a thing).

So if you’re at 3x, you should really be looking at what’s different about the algorithm. As @rdeits notes, static vectors may be one major difference: I assume a lot of the memory usage in the C++ version is static and not dynamic for this.


#4

I would say this calls for some profiling to identify the bottleneck.


#5

I did some benchmarking (… showing off my new package)

The first row is board = copy(game.board). Couldn’t you replace it with something like copy!(tmp_board, game.board), to avoid allocating? Alternatively, as @rdeits said, check out StaticArrays. The second row identifies a dynamic dispatch:

function runGame(game::Tictactoe, bot1::Bot, bot2::Bot)
  while !game.finished
    bot = game.currentPlayer == 1 ? bot1 : bot2  #### HERE
    moveIndex = genmove(game, bot)
    game = playIndex(game, moveIndex)
  end

@code_warntype would tell you that bot does not have a fixed type (sometimes it’s typeof(bot1), sometimes typeof(bot2)). If you replaced it with something like:

   moveIndex = game.current_player == 1 ? genmove(game, bot1) : genmove(game, bot2)

Then you might gain another 25%.


#6

Thanks for your replies. I know this is quite a lot of code so I really appreciate your help.
StaticArrays was indeed a good idea: the Julia code is now only 2x slower than the C++ code, which is very acceptable.
@cstjean I’ll try that soon, thanks.


#7

The rest of the difference is probably due to the dynamic dispatch @cstjean pointed out. C++ is pretty good at this because it uses vtables in general – which are pretty fast but not fast enough for things like arithmetic – and devirtualization only occasionally. Julia, on the other hand, uses devirtualization as much as possible and vtables never. We can address this, but that work hasn’t been done yet. Presumably the gameplay alternates between the two players, so you could just unroll the loop so that it includes one call to one bot and another call to the other bot, both of which have predictable types.


#8

Profiling shows you are spending a lot of time for the random number generation.

As range in rand(rng) is very simple, try this (it reduced runtime by 20% on my computer)

genmove(game::Tictactoe, bot::RandomBot) = ceil(Int,rand()*game.nbFreeMoves)

Also, the map function allocates an array which you are not using.
You may want to consider the code below instead of indmax(wins).
But I am actually not sure if it is correct. Please check yourself.

function genmove(game::Tictactoe, bot::McBot)
    nbMoves = game.nbFreeMoves
    nbMoveSims = div(bot.maxSims, nbMoves)
    bestmove(nbMoves,game,nbMoveSims) 
  end
  
function bestmove(nbMoves::Int,game,nbMoveSims::Int)
      bestval=0
      bestidx=1
      for i=1:nbMoves
          thisval=evalMove(game, i, nbMoveSims)
          if thisval>bestval
              bestval=thisval
              bestidx=i
          end
      end
      bestidx
  end

EDIT: The Julia code could possibly be faster if you would generate multiple random numbers at once. But that would probably not be a fair comparison against the reference code.


#9

That’s really unfortunate that this hack is so much faster than built-in rand(1:n). I have a plan to improve this (extending #22612 to generation of multiple values together with #23964), but I don’t know how much the gap can be reduced with this approach…


#10

well, I suppose rand(‘some_range’) is much more general than what I have built. Although one could probably extend what I have written to more general ranges (with step size 1).

It would actually be interesting to see if my simple approach is still faster even for something like:

 rand(231:33:423)

#11

I’ve tried several implementations, using SVector, MVector, mutable struct… SVector is the fastest. The “rand trick” reduces the computation time significantly but fixing the type bot1/bot2 is quite negligible on my tests.
Current julia code is only 50% slower than C++ code :slightly_smiling_face:
Thanks for your help.


#12

The proposed approach (with ceil) is faster but it does not guarantee an uniform distribution - this would be apparent for large n (for small n this non-uniformity can be neglected). The implementation of range(1:n) tries to give correct results.


#13

Just ran the test cases from GitHub code. My take is the performance penalty is about 33% or 1.33x which is in the ballpark where we expect. Moreover, the command for timecheck includes julia runtime loading time. One option may be to run the test multiple times to ensure the runtime is resident in memory at least the code part or keep julia session open and loaded in another window and run the .jl file in the REPL once so that the code block from julia runtime is run at least once.

Secondly, the focus also should be where Julia can add performance gains like SIMD or LLVM operations as well as dispatches. I do not think this code will do enough justice on those aspects. Although, I have not gone through the code but such problems are generally will be on recursions. So not the best performance sample as such.

regards,

Sambit

$ time ./tictactoe_cmp.out
botA botB nbSims nbGames drawsA1B2 winsA1 winsB2 drawsB1A2 winsB1 winsA2 draws winsA winsB
RandomBot McBot 1 1000 78 428 494 9 911 80 87 508 1405
RandomBot McBot 10 1000 103 405 492 5 950 45 108 450 1442
RandomBot McBot 100 1000 42 232 726 9 983 8 51 240 1709
RandomBot McBot 1000 1000 36 135 829 3 991 6 39 141 1820
RandomBot McBot 10000 1000 24 126 850 7 982 11 31 137 1832

real	0m12.193s
user	0m12.189s
sys	0m0.005s

$ time julia tictactoe_cmp_svector.jl
botA botB nbSims nbGames drawsA1B2 winsA1 winsB2 drawsB1A2 winsB1 winsA2 draws winsA winsB
RandomBot McBot 1 1000 35 306 659 33 860 107 68 413 1519
RandomBot McBot 10 1000 22 261 717 16 919 65 38 326 1636
RandomBot McBot 100 1000 4 131 865 12 973 15 16 146 1838
RandomBot McBot 1000 1000 5 54 941 1 999 0 6 54 1940
RandomBot McBot 10000 1000 0 54 946 0 1000 0 0 54 1946

real	0m16.199s
user	0m16.185s
sys	0m0.324s


#14

@bkamins can you please elaborate where the ceil approach differs from the rand(1:n) approach? Why is it not uniform?


#15

Because in Julia you get 52 bits of randomness when you call rand() and:

  1. e.g. Int64 has 64 bits so it is clear that you will not be able to obtain all Int64 values this way;
  2. even if you want n smaller than 2^52 in 1:n range usually it will not be a divisor of 2^52 which means that you will have distribute the remainder of this division unevenly.

As I have said - those problems are negligible if n is small.


#16

Can you please elaborate on this? This would be useful to know more about, but I could not find it in the documentation.


#17

Thank you.
I do not fully understand what you are saying:

  1. is rand() uniformly distributed in ]0,1[ ?
    I do not think we need to bother about the random bits here (in any case we will only have a finite number of random bits, so the ‘uniform’ property will only be given for a certain bin size (say of a histogram that we consider))

Let us assume n<2^52. Then if rand() is “uniform”, then ceil(Int,rand()) should do the trick. No?

Maybe you could elaborate what rand(1:n) does, starting from a x=rand() (assuming rand() is actually the basis)
I tried looking into the code in Julia Bas, but ran into some issues whith @which , see below.

Main> @which rand(Base.GLOBAL_RNG,1:3)
rand(rng::AbstractRNG, r::UnitRange{#s268} where #s268<:Union{BigInt, Bool, Signed, Unsigned}) in Base.Random at random.jl:673

Main> #these calls do not work

Main> @which rand(Base.GLOBAL_RNG, Base.RangeGenerator(1:3))
ERROR: UndefVarError: RangeGenerator not defined
Stacktrace:
 [1] eval(::Module, ::Any) at .\boot.jl:235

Main> include("C:\\Julia-0.6.X\\share\\julia\\base\\random.jl")
Random

Main> using Random
WARNING: using Random.rand in module Main conflicts with an existing identifier.

Main> @which rand(Random.GLOBAL_RNG, Random.RangeGenerator(1:3))
ERROR: no method found for the specified argument types
Stacktrace:
 [1] which(::Any, ::Any) at .\reflection.jl:823
 [2] eval(::Module, ::Any) at .\boot.jl:235

Main>

#18

rand() in the end (after several intermediate calls) invokes this line in random.jl:

@inline mt_pop!(r::MersenneTwister) = @inbounds return r.vals[r.idx+=1]

And vals is Vector{Float64} that is populated by this call:

function gen_rand(r::MersenneTwister)
    dsfmt_fill_array_close1_open2!(r.state, pointer(r.vals), length(r.vals))
    mt_setfull!(r)
end

the called function is a part of libdSFMT library and it generates values from [1,2[ inverval. It is easy to confirm by inspecting Base.GLOBAL_RNG.vals.

What is the nature of a random Float64 from [1,2[ interval?
Its 12 high bits are 001111111111 and the following 52 bits are random.

You can check that the 12 high bits are constant and that the other bits are random e.g. like this:

[mean(getindex.(bits.(Base.GLOBAL_RNG.vals), i).=='1') for i in 1:64]

(you can devise a more elaborate test by actually calling rand on [1,2[ interval, as above you get only 382 random values)


#19

This is the function definition you are looking for:

# this function uses 32 bit entropy for small ranges of length <= typemax(UInt32) + 1
# RangeGeneratorInt is responsible for providing the right value of k
function rand{T<:Union{UInt64, Int64}}(rng::AbstractRNG, g::RangeGeneratorInt{T,UInt64})
    local x::UInt64
    if (g.k - 1) >> 32 == 0
        x = rand(rng, UInt32)
        while x > g.u
            x = rand(rng, UInt32)
        end
    else
        x = rand(rng, UInt64)
        while x > g.u
            x = rand(rng, UInt64)
        end
    end
    return reinterpret(T, reinterpret(UInt64, g.a) + rem_knuth(x, g.k))
end

It is not calling rand() but either rand(rng, UInt64) or rand(rng, UInt32) depending on the size of n. And it exactly does the corrections I am talking about.

Notice that if n is large actually we have:

function rand(r::MersenneTwister, ::Type{UInt64})
    reserve(r, 2)
    rand_ui52_raw_inbounds(r) << 32 ⊻ rand_ui52_raw_inbounds(r)
end

so Julia uses then two random numbers (of 52 bit size) to produce a proper result.


#20

As a side note. I recommend you to have a look at https://github.com/sunoru/RandomNumbers.jl.
Especially http://sunoru.github.io/RandomNumbers.jl/latest/man/xorshifts/ is a very good RNG.