Issues with PyCall and parallelization using HTTP.jl-built web service



Hi all, in Julia 0.7 beta, I am trying to create a web service using HTTP.jl that can use a periodically updated tree (built with NearestNeighbors.jl). We need to use sklearn from Python to generate the vectors with which to populate the tree, and that use of PyCall seems to block everything, so the web server cannot respond immediately. Any ideas as to how to get these to work together? Below is basically the code I’m running:

using Distributed
using Sockets

using HTTP
@everywhere using HTTP
using JSON
@everywhere using JSON
using Mongo
@everywhere using Mongo
using PyCall
@everywhere using PyCall
using LibBSON
@everywhere using LibBSON
using Distances
@everywhere using Distances

@everywhere include("NearestNeighbors.jl/src/NearestNeighbors.jl")

# Single-slot remote channel through which a freshly built `(tree, uids)` pair
# is handed from the tree builder to the request handler.
const c = RemoteChannel() do
    Channel{Tuple{NearestNeighbors.BallTree, Vector{String}}}(1)
end

# Helper: drop every character outside the printable ASCII range 0x20-0x7e
# (control characters and all non-ASCII characters) from `a`.
@everywhere function stripc0x(a::String)
    return replace(a, r"[^\x20-\x7e]" => "")
end

# Turn one Mongo article document into a `(text, uid)` tuple: the entries of
# its "content" field joined with two spaces and cleaned by `stripc0x`, paired
# with its "uid" field.
@everywhere function _process_mongo_article(article::LibBSON.BSONObject)
    text = stripc0x(join(article["content"], "  "))
    return (text, article["uid"])
end

# Vectorize every article yielded by `mongo_cursor`.
#
# Each document is reduced to `(text, uid)` by `_process_mongo_article`; the
# texts are passed to the Python `vectorizer`'s `transform` method. Returns the
# adjoint of the result as a dense `Matrix`, together with the uids in cursor
# order.
@everywhere function process_documents(mongo_cursor::Mongo.MongoCursor, vectorizer::PyObject)
    mongo_objs = [_process_mongo_article(x) for x in mongo_cursor]
    art_contents = first.(mongo_objs)
    uids = last.(mongo_objs)

    m = vectorizer[:transform](art_contents)
    # NOTE(review): presumably `transform` yields one row per document, so the
    # adjoint gives one column per document for the tree -- confirm.
    return Matrix(m'), uids
end

# Build a nearest-neighbour tree from articles stored in MongoDB.
#
# Fetches up to 100 documents whose "provider" is "ap" (only their "uid" and
# "content" fields), vectorizes them with `process_documents`, and returns
# `(tree, uids)` where `tree` is a `BallTree` using the Jaccard distance.
@everywhere function build_tree(vectorizer::PyObject)
    # Host name was misspelled as "locallhost" before.
    client = MongoClient("mongodb://localhost")
    collection = MongoCollection(client, "db", "collection")

    curs = find(collection, ("provider" => "ap"), ("_id" => false, "uid" => true, "content" => true); limit=100)
    m, uids = process_documents(curs, vectorizer)
    # reorder=false is passed through to the BallTree constructor unchanged.
    return NearestNeighbors.BallTree(m, Jaccard(); reorder=false), uids
end

# Rebuild the tree forever, publishing each new `(tree, uids)` pair on the
# channel `c`. Never returns.
#
# Each build runs inline and `put!` waits while the single-slot channel is
# still full, so at most one unconsumed tree exists at a time. (Wrapping the
# `put!` in `@async` inside this loop would create tasks without bound, and
# since the loop never yields none of them would ever run.)
#
# This call blocks its caller. `@async` only creates a task on the same
# process, so to keep the web server responsive this has to run on another
# process (e.g. via `@spawn`).
function manage_tree(vectorizer::PyObject)
    while true
        put!(c, build_tree(vectorizer))
    end
end

# HTTP handler: return, as JSON, the indices of all tree points within
# distance 0.8 of the query point.
#
# `req`'s payload must be a JSON object with a "point" array. `objdict` caches
# the current tree and uids under the keys "tree" and "uids"; when a newer
# tree is waiting on the channel `c`, the cache is refreshed first.
# The response has the form {"idxs": [...]}.
@everywhere function find_nn(req::HTTP.Request, objdict::Dict)
    data = JSON.parse(HTTP.payload(req,String))
    point = Vector{Float64}(data["point"])

    # Swap in a newly built tree if one is available; otherwise keep serving
    # from the cached one.
    if isready(c)
        objdict["tree"], objdict["uids"] = take!(c)
        println("new tree")
    end
    tree = objdict["tree"]

    ret = NearestNeighbors.inrange(tree, point, 0.8)
    ret_dict = Dict("idxs" => ret)
    return JSON.json(ret_dict)
end

# Assemble and start the web service.
#
# Loads the pickled vectorizer through joblib, builds an initial tree, registers
# `find_nn` as the POST handler for "find_nn", and starts serving on port 8000
# in a background task. Returns that task.
function build_service()
    router = HTTP.Router()
    # The function form `pyimport` is used here because `@pyimport` declares a
    # `const` binding, which is not permitted inside a function body.
    joblib = pyimport("sklearn.externals.joblib")
    vectorizer = joblib[:load]("2c410c61-6016-4326-bb4b-83e81ebc1814.pkl")
    tree, uids = build_tree(vectorizer)
    # Mutable cache shared with the handler closure, so `find_nn` can swap in
    # newer trees.
    objdict = Dict{String, Union{typeof(tree), Vector{String}}}("tree" => tree, "uids" => uids)
    HTTP.register!(router, "POST", "find_nn", HTTP.HandlerFunction((req::HTTP.Request) -> find_nn(req, objdict)))
    server = HTTP.Servers.Server(router)
    # NOTE(review): the address literal was empty (`ip""`), which is not a valid
    # IP address. Loopback is used as the conservative choice -- confirm the
    # intended bind address (e.g. ip"0.0.0.0" to accept remote clients).
    return @async HTTP.serve(server, ip"127.0.0.1", 8000)
end


I tried running the manage_tree function in the background, and also the HTTP.serve function (as shown above). In both cases, I get immediate responses from the web service to POST requests until the build_tree function is called. At that point, everything is blocked until it finishes.


Figured it out (I need to read the docs more thoroughly): @async doesn’t actually run what you call with it on a separate process, so I needed to use @spawn (which does) and pass around my global channel object so that the main process and the secondary process could read from and write to it.