Hi all, in Julia 0.7 beta, I am trying to create a web service using HTTP.jl that can use a periodically updated tree (built with NearestNeighbors.jl). We need to use sklearn from Python to generate the vectors with which to populate the tree, and that use of PyCall seems to block everything, so the web server cannot respond immediately. Any ideas as to how to get these to work together? Below is basically the code I'm running:
using HTTP
@everywhere using HTTP
using JSON
@everywhere using JSON
using Mongo
@everywhere using Mongo
using PyCall
@everywhere using PyCall
using LibBSON
@everywhere using LibBSON
using Distances
@everywhere using Distances
@everywhere include("NearestNeighbors.jl/src/NearestNeighbors.jl")
# Single-slot channel carrying a (tree, uids) pair; `manage_tree` puts into it and
# `find_nn` takes from it.
const c = RemoteChannel() do
    Channel{Tuple{NearestNeighbors.BallTree, Vector{String}}}(1)
end
# Helper: remove every character outside the range \x20-\x7e (space through tilde)
# from `a` and return the resulting string.
@everywhere function stripc0x(a::String)
    return replace(a, r"[^\x20-\x7e]" => "")
end
# Turn one article document into a `(text, uid)` tuple: the "content" entries are
# joined with single spaces and passed through `stripc0x`; "uid" is returned as stored.
@everywhere function _process_mongo_article(article::LibBSON.BSONObject)
    cleaned = stripc0x(join(article["content"], " "))
    return (cleaned, article["uid"])
end
# Read every document from `mongo_cursor`, vectorize the article texts with
# `vectorizer`, and return `(matrix, uids)`.
#
# The matrix is the transpose of what `vectorizer[:transform]` returns, materialized
# as a `Matrix` — presumably so that each column is one document; confirm against
# the vectorizer's output layout.
@everywhere function process_documents(mongo_cursor::Mongo.MongoCursor, vectorizer::PyObject)
    records = [_process_mongo_article(doc) for doc in mongo_cursor]
    # Each record is a (text, uid) 2-tuple, so first/last select its two slots.
    texts = first.(records)
    ids = last.(records)
    features = vectorizer[:transform](texts)
    return Matrix(features'), ids
end
# Fetch up to 100 articles with provider "ap" from MongoDB, vectorize them with
# `vectorizer`, and build a BallTree (Jaccard metric, `reorder=false`) over the result.
#
# Returns a `(tree, uids)` tuple, where `uids` are the "uid" values of the fetched
# documents as produced by `process_documents`.
@everywhere function build_tree(vectorizer::PyObject)
    # The host was previously misspelled "locallhost".
    client = MongoClient("mongodb://localhost")
    collection = MongoCollection(client, "db", "collection")
    # Projection: only "uid" and "content" are needed downstream; "_id" is excluded.
    curs = find(
        collection,
        ("provider" => "ap"),
        ("_id" => false, "uid" => true, "content" => true);
        limit=100,
    )
    m, uids = process_documents(curs, vectorizer)
    return NearestNeighbors.BallTree(m, Jaccard(); reorder=false), uids
end
# Rebuild the tree roughly once a minute and publish it on the channel `c`.
# Loops forever; it only exits if interrupted.
#
# The rebuild runs directly in this loop rather than in an `@async` task. Tasks are
# scheduled cooperatively, so an `@async` wrapper would not let other tasks on this
# process run while `build_tree` is inside calls that do not yield. Spawning a task
# per cycle also let tasks pile up, each blocked on `put!` with its own tree,
# whenever the previous tree had not been taken from the single-slot channel.
function manage_tree(vectorizer::PyObject)
    while true
        sleep(60)
        try
            fresh = build_tree(vectorizer)
            # Drop a tree nobody picked up, so the slot is free for `put!` and the
            # consumer receives the newest build. This loop is the only producer
            # for `c` visible in this file.
            isready(c) && take!(c)
            put!(c, fresh)
        catch err
            err isa InterruptException && rethrow()
            # Keep serving the last good tree instead of stopping the refresh loop.
            @warn "tree rebuild failed; will retry on the next cycle" exception =
                (err, catch_backtrace())
        end
    end
end
# Handle a nearest-neighbour request.
#
# The request body is parsed as JSON and its "point" entry is converted to a
# `Vector{Float64}`. If a newly built tree is waiting on channel `c`, it is taken
# and stored in `objdict` (keys "tree" and "uids"); otherwise the tree already held
# in `objdict` is used. Returns a JSON string `{"idxs": [...]}` holding the result
# of `NearestNeighbors.inrange(tree, point, 0.8)`.
@everywhere function find_nn(req::HTTP.Request, objdict::Dict)
    data = JSON.parse(HTTP.payload(req, String))
    point = Vector{Float64}(data["point"])
    if isready(c)
        tree, uids = take!(c)
        objdict["tree"] = tree
        objdict["uids"] = uids
        @info "new tree"
    else
        tree = objdict["tree"]
    end
    # Emitted only when debug logging is enabled; the tree used to be printed to
    # stdout on every request.
    @debug "find_nn query" tree
    ret = NearestNeighbors.inrange(tree, point, 0.8)
    return JSON.json(Dict("idxs" => ret))
end
# Load the pickled vectorizer, build the initial tree, start the HTTP server in a
# background task, and then enter the periodic rebuild loop (`manage_tree`).
function build_service()
    @pyimport sklearn.externals.joblib as jl
    vectorizer = jl.load("2c410c61-6016-4326-bb4b-83e81ebc1814.pkl")

    # Initial tree, so requests can be answered before the first rebuild.
    tree, uids = build_tree(vectorizer)
    state = Dict{String, Union{typeof(tree), Vector{String}}}(
        "tree" => tree,
        "uids" => uids,
    )

    # The handler closes over `state`, which find_nn updates when a new tree arrives.
    handler = HTTP.HandlerFunction((req::HTTP.Request) -> find_nn(req, state))
    router = HTTP.Router()
    HTTP.register!(router, "POST", "find_nn", handler)

    @async HTTP.serve(HTTP.Servers.Server(router), ip"0.0.0.0", 8000)
    manage_tree(vectorizer)
end
# Script entry point. build_service ends by calling manage_tree, which loops
# forever, so this call does not return normally.
build_service()
I tried running the `manage_tree` function in the background, and also the `HTTP.serve` function (as shown above). Both times, I get immediate responses from the web service to POST requests until the `build_tree` function is called. At that point, everything is blocked until it is done.