Clean up DataFrame code for Julia benchmarks


#1

I’ve been hacking https://github.com/JuliaLang/julialang.github.com/blob/master/benchmarks.ipynb to automate several steps in producing the microbenchmark plot on julialang.org. I’ve gotten it to do what I want, but since this code appears on the Julia website, I’d be grateful if any DataFrame experts could help clean it up.

The code

  1. loads benchmarks.csv datafile as a DataFrame
  2. spiffs up language names with a Dict
  3. computes benchmark timings normalized by C times
  4. computes geometric mean of timings per language
  5. sorts the data, putting C 1st, Julia 2nd, then others sorted by geometric mean
# Read the raw timings. The CSV has no header row, so the column names are supplied here.
benchmarks = readtable("benchmarks_subset.csv",
                       header=false,
                       names=[:language, :function, :time])

# Display names for the lowercase language tags used in the datafile.
# Looking up a tag that is not listed here raises a KeyError.
dict = Dict(
    "c"           => "C",
    "julia"       => "Julia",
    "lua"         => "LuaJIT",
    "fortran"     => "Fortran",
    "java"        => "Java",
    "javascript"  => "JavaScript",
    "matlab"      => "Matlab",
    "mathematica" => "Mathematica",
    "python"      => "Python",
    "octave"      => "Octave",
    "r"           => "R",
    "go"          => "Go")

# Replace every tag in the language column with its display name
benchmarks[:language] = [dict[tag] for tag in benchmarks[:language]]

# Normalize benchmark times by C times.
# ctime holds the C rows of the table. Only the benchmark name and its C timing are
# needed as a lookup, so those two columns are selected and the timing column is
# named explicitly. This avoids joining every column and then deleting/renaming the
# auto-suffixed `language_1` / `time_1` columns that the join would generate.
ctime = benchmarks[benchmarks[:language] .== "C", :]
cbaseline = ctime[[:function, :time]]
rename!(cbaseline, :time, :ctime)

# Default (inner) join: a benchmark with no C timing has no match and is dropped
benchmarks = join(benchmarks, cbaseline, on=:function)

# Time relative to C for the same benchmark (C itself becomes 1.0)
benchmarks[:normtime] = benchmarks[:time] ./ benchmarks[:ctime];

# Compute the geometric mean of the normalized times for each language.
# Grouping on the languages actually present in the table (rather than looping over
# every display name in the lookup Dict) avoids taking the geometric mean of an empty
# selection for languages that have no rows in the datafile, and yields concretely
# typed columns instead of columns built from untyped `[]` accumulators.
langmean = by(benchmarks, :language) do df
    DataFrame(geomean = geomean(df[:normtime]))
end

# Sort priority: C gets 1, Julia gets 2, every other language gets 3
langmean[:priority] = [lang == "C" ? 1 : (lang == "Julia" ? 2 : 3)
                       for lang in langmean[:language]]

# Add the geometric means (and priorities) back into the benchmarks dataframe
benchmarks = join(benchmarks, langmean, on=:language)

# Put C first, Julia second, and sort the rest by geometric mean
sort!(benchmarks, cols=[:priority, :geomean]);

And a minimal datafile, benchmarks_subset.csv

c,iteration_mandelbrot,0.266349
c,iteration_pi_sum,27.368069
julia,iteration_mandelbrot,0.163549
julia,iteration_pi_sum,27.368159
go,iteration_mandelbrot,0.18474092830000002
go,iteration_pi_sum,27.917825880000002
fortran,iteration_mandelbrot,.236753
fortran,iteration_pi_sum,27.367718
javascript,iteration_mandelbrot,0.084
javascript,iteration_pi_sum,27.4
matlab,iteration_mandelbrot,1.31600000
matlab,iteration_pi_sum,27.37700000