Benchmark β (Python+Pandas) && (Julia +JuliaDB)
-
Aim of the benchmark
This benchmark aims to evaluate the Julia code using real-world data from the perspective of a normal user (not expert from computing science ). Hope it could be useful to improve the Language and package. -
Data source
Transit network data of New York (Open Data). -
Data processing task
Extract the points on the bus routes between every pair of the consequential bus stops. -
About the code
Currently, both Python and Julia code are not optimized. In the future, the Julia version will be optimized and all old versions will be recorded for the comparison purpose. The Python code will not be optimized as the benchmark. -
Complete source code and Data
(GitHub - zhangliye/juliabenchmark: benchmark for Julia language)
1 Test and Results
The two version of code implement the same algorithm. Firstly, the algorithm is implemented using Python. Then, the Python code is translated to Julia almost line by line.
# It should be noted that both version of the code are not optimized with much effort.
The data operation mainly includes
- Open the CSV file
- Table data operation (iteration, selection)
- Operation on Dict, Tuple and Vector data structure
Tabel 1- Data structure and tabular data package used
Python 3.6.5 | Julia 1.02 | |
---|---|---|
Package for tabular data | Pandas 0.23 | JuliaDB 0.10 |
List | [x1,x2,β¦] | Vector{T}() |
Dict | {k1:V1, β¦} | Dict{T1,T2}() |
Tuple | (v1,β¦) | Tuple{T1,T2}() |
Performance comparison on processing 500 rows of data.
Table 2 - Julia code of version 1 (second)
Task | Python 3.6.5 | Julia 1.02 | Julia 1.02 (precompile) |
---|---|---|---|
open shapes.txt
|
0.07 | 22.87 | 0.0368 ![]() |
Open stop_times.txt
|
1.37 | 6.54 | 1.178 ![]() |
Open trips.txt
|
0.06 | 1.41 | 0.037 ![]() |
Open stops.txt
|
0.01 | 2.72 | 0.002 ![]() |
extract segment | 38.96 ![]() |
156.07 | 94.84 |
- Experiment environment
Julia Version 1.0.2
Commit d789231e99 (2018-11-08 20:11 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: Intel(R) Coreβ’ i7-7600U CPU @ 2.80GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.0 (ORCJIT, skylake)
2 How to run the test on your machine?
- Step1: Download the code and data
$ git clone https://github.com/zhangliye/juliabenchmark.git
- Step 2: Active the project
$ cd juliabenchmark/tablebenchmark
$ julia
(tablebenchmark) pkg> activate .
- Step 3: Run julia code test
(tablebenchmark) pkg> test
- Step 4: Run Python code test
$python ./src/extract_segment.py
3 Source code
3.1 Data and code on Github
(GitHub - zhangliye/juliabenchmark: benchmark for Julia language)
JuliaBenchmark
3.2 Python Version Code (extraject_segment.py)
import math
import sys
import pandas as pd
def get_stop_pos(frm_stop, stop_id):
"returnt [(lon,lat),...] of the give `stop_id`"
sub_frm = frm_stop[frm_stop['stop_id']==stop_id]
if sub_frm.shape[0]>0:
return (sub_frm.iloc[0]['stop_lon'], sub_frm.iloc[0]['stop_lat'])
else:
return None
## get shape id
def get_shape(frm_trip, trip_id):
"return `shape_id` of the given `trip_id`"
sub_frm = frm_trip[frm_trip["trip_id"]==trip_id]
if sub_frm.shape[0] > 0:
return sub_frm.iloc[0]['shape_id'] # return the first row value
else:
return None
# get stops of shapes
def get_stops(frm_st, trip_id):
"return [stop_id1, ...] of the given `trip_id`"
stops = []
sub_frm = frm_st[frm_st["trip_id"]==trip_id]
for i in range(sub_frm.shape[0]):
stops.append( sub_frm.iloc[i]['stop_id'] )
return stops
# get shape points
def get_points(frm_shape, shape_id):
pts = []
sub_frm = frm_shape[frm_shape["shape_id"]==shape_id]
for i in range(sub_frm.shape[0]):
pts.append( (sub_frm.iloc[i]['shape_pt_lon'], sub_frm.iloc[i]['shape_pt_lat']) )
return pts
def dist_meter(pt1, pt2):
"""pt1-(lon,lat), pt2-(lon, lat)"""
dst = math.sqrt( (pt1[0]-pt2[0])**2 + (pt1[1]-pt2[1])**2 )
return dst*110*1000
def nearest_pos(pt, shape_pts):
dst_near = sys.maxsize
idx = -1
for i in range(len(shape_pts)):
dst = dist_meter(pt, shape_pts[i])
if dst<dst_near:
dst_near = dst
idx = i
return (idx, dst_near)
def process_trip(segements, stops, stop_pts, shape_pts, max_dist=300):
"""
segements: {(from_id, to_id):[(x1, y1),...], ...}
stops: [stop_id1,...]
stop_pts: [(x1,y1), ...]
stop_pts: [(x1,y1), ...]
max_dist: Maximum distance, beyond which
"""
n_error = 0
for i in range(len(stops) - 1):
from_stop = stops[i]
to_stop = stops[i + 1]
from_pos = stop_pts[i]
to_pos = stop_pts[i + 1]
if ((from_stop, to_stop) in segements) or ((to_stop, from_stop) in segements):
continue
i_idx1, dst1 = nearest_pos(from_pos, shape_pts)
i_idx2, dst2 = nearest_pos(to_pos, shape_pts)
if -1 == i_idx1 or -1 == i_idx2 or i_idx1 == i_idx2:
print("Point not found: ", i_idx1, i_idx2)
n_error += 1
continue
if dst1 > max_dist or dst2 > max_dist:
print("Point too far: ", from_stop, to_stop)
n_error += 1
continue
if i_idx1 < i_idx2:
segements[(from_stop, to_stop)] = shape_pts[i_idx1:(i_idx2 + 1)]
else:
segements[(from_stop, to_stop)] = shape_pts[i_idx2:(i_idx1 + 1)] # two direction are the same
if n_error > 0:
print("error stops: ", n_error)
#----------------------------------------- API
def extract_segment(segements, frm_trip, frm_stop, frm_stop_time, frm_shape, max_dist=300, testing=false):
"""
:param segements: {(from_id, to_id):[(x1, y1),...], ...}
:param frm_trip:
:param frm_stop:
:param frm_stop_time:
:param frm_shape:
:return:
"""
stop_chain_key = {} # 'key':0
n = 0
k1 = ''
for i in frm_trip.index:
if i % 100 == 0:
print(i, "found: ", n)
if i>500:
break
trip_id = frm_trip.iloc[i]['trip_id']
stops = get_stops(frm_stop_time, trip_id) # [stop_id1,...]
# k1=str(stops)
k1 = tuple(stops) # [stop_id1, ...]
if k1 in stop_chain_key:
continue
shape_id = frm_trip.iloc[i]['shape_id']
shape_pts = get_points(frm_shape, shape_id) # [(x1,y1), ...]
# check all stops are avaliable
stop_pts = []
for stop_id in stops:
pt = get_stop_pos(frm_stop, stop_id)
if (pt == None):
print("found one error")
continue
stop_pts.append(pt)
process_trip(segements, stops, stop_pts, shape_pts, max_dist=max_dist)
stop_chain_key[k1] = -1
# stop_chain_key[str(stops.reverse())] = -1
stops.reverse()
stop_chain_key[tuple(stops)] = -1
n += 1
testing && break
return segements
3.3 Julia Version Code (extraject_segment.jl)
using JuliaDB
"""
returnt [(lon,lat),...] of the given `stop_id`
"""
function get_stop_pos(frm_stop::IndexedTable, stop_id::String)
sub_frm = filter(p->p.stop_id==stop_id, frm_stop)
if length(sub_frm)>0
return (sub_frm[1][:stop_lon], sub_frm[1][:stop_lat])
else
return (nothing, nothing)
end
end
"""
return `shape_id` of the given `trip_id`
"""
function get_shape(frm_trip::IndexedTable, trip_id::String)
sub_frm = filter(p->p.trip_id==trip_id, frm_trip)
if length(sub_frm)>0
return sub_frm[1][:shape_id] # return the first row value
else
return nothing
end
end
"""
return [stop_id1, ...] of the given `trip_id`
"""
function get_stops(frm_st, trip_id)
stops = Vector{String}()
sub_frm = filter(p->p.trip_id==trip_id, frm_st)
for i in 1:length(sub_frm)
push!(stops, sub_frm[i][:stop_id])
end
return stops
end
"""
get shape points
"""
function get_points(frm_shape, shape_id)
pts = Vector{Tuple{Float64,Float64}}()
sub_frm = filter(p->p.shape_id==shape_id, frm_shape)
for i in 1:length(sub_frm)
push!( pts, (sub_frm[i][:shape_pt_lon], sub_frm[i][:shape_pt_lat]) )
end
return pts
end
"""
pt1-(lon,lat), pt2-(lon, lat)
"""
function dist_meter(pt1::Tuple{Float64,Float64}, pt2::Tuple{Float64,Float64})
dst = sqrt( (pt1[1]-pt2[1])^2 + (pt1[2]-pt2[2])^2 )
return dst*110*1000
end
function nearest_pos(pt::Tuple{Float64,Float64}, shape_pts::Vector{Tuple{Float64,Float64}})
dst_near = typemax(Int)
idx = -1
for i in 1:length(shape_pts)
dst = dist_meter(pt, shape_pts[i])
if dst<dst_near
dst_near = dst
idx = i
end
end
return (idx, dst_near)
end
"""
segements: {(from_id, to_id):[(x1, y1),...], ...}
stops: [stop_id1,...]
stop_pts: [(x1,y1), ...]
stop_pts: [(x1,y1), ...]
"""
function process_trip(segements::Dict{Tuple{String, String},Vector{Tuple{Float64, Float64}}},
stops::Vector{String}, stop_pts::Vector{Tuple{Float64, Float64}},
shape_pts::Vector{Tuple{Float64, Float64}})
n_error = 0
for i in 1:(length(stops)- 1)
from_stop = stops[i]
to_stop = stops[i + 1]
from_pos = stop_pts[i]
to_pos = stop_pts[i + 1]
if haskey(segements, (from_stop, to_stop)) || haskey(segements, (to_stop ,from_stop))
continue
end
i_idx1, dst1 = nearest_pos(from_pos, shape_pts)
i_idx2, dst2 = nearest_pos(to_pos, shape_pts)
if -1 == i_idx1 || -1 == i_idx2 || i_idx1 == i_idx2
println("Point not found: ", i_idx1, i_idx2)
n_error += 1
continue
end
if dst1 > 300 || dst2 > 300
println("Point too far: ", from_stop, to_stop)
n_error += 1
continue
end
if i_idx1 < i_idx2
segements[(from_stop, to_stop)] = shape_pts[i_idx1:(i_idx2)]
else
segements[(from_stop, to_stop)] = shape_pts[i_idx2:(i_idx1)] # two direction are the same
end
end
n_error > 0 && println("error stops: ", n_error)
end
"""
:param segements: {(from_id, to_id):[(x1, y1),...], ...}
:param frm_trip:
:param frm_stop:
:param frm_stop_time:
:param frm_shape:
:return:
"""
function extract_segment(segements::Dict{Tuple{String, String},Vector{Tuple{Float64, Float64}}},
frm_trip::IndexedTable, frm_stop::IndexedTable,
frm_stop_time::IndexedTable, frm_shape::IndexedTable; testing=false)
stop_chain_key = Dict{String, Int}() # 'key':0
n = 0
k1 = ""
for i in 1:length(frm_trip)
i % 100 == 0 && println(i, " found: ", n)
i>500 && break
trip_id = frm_trip[i][:trip_id]
stops = get_stops(frm_stop_time, trip_id) # [stop_id1,...]
# k1=str(stops)
k1 = string(stops) # [stop_id1, ...]
haskey(stop_chain_key, k1) && continue
shape_id = frm_trip[i][:shape_id]
shape_pts = get_points(frm_shape, shape_id) # [(x1,y1), ...]
# check all stops are avaliable
stop_pts = Vector{Tuple{Float64, Float64}}()
for stop_id in stops
pt = get_stop_pos(frm_stop, stop_id)
if pt[1] == nothing
println("found one error")
continue
end
push!(stop_pts, pt)
end
process_trip(segements, stops, stop_pts, shape_pts)
stop_chain_key[k1] = -1
# stop_chain_key[str(stops.reverse())] = -1
reverse!(stops)
stop_chain_key[string(stops)] = -1
n += 1
testing && break
end
end
4 Summary
The current Julia version code is translated from Python code line by line. Please point out the improper code and I will update the new results. Hope the results would be helpful to improve the package and language from the perspective of the normal users.