Issues with PyCall and parallelization using HTTP.jl-built web service

web
parallel

#1

Hi all, in Julia 0.7 beta, I am trying to create a web service using HTTP.jl that serves queries against a periodically-updated tree (built with NearestNeighbors.jl). We need to use sklearn from Python to generate the vectors with which to populate the tree, and that use of PyCall seems to block everything, preventing the web server from responding immediately. Any ideas as to how to get these to work together? Below is basically the code I’m running:

using HTTP
@everywhere using HTTP
using JSON
@everywhere using JSON
using Mongo
@everywhere using Mongo
using PyCall
@everywhere using PyCall
using LibBSON
@everywhere using LibBSON
using Distances
@everywhere using Distances

# Load NearestNeighbors from its source file (relative path) on every process.
@everywhere include("NearestNeighbors.jl/src/NearestNeighbors.jl")

# Capacity-1 remote channel carrying (tree, uids) pairs: manage_tree puts freshly
# built trees into it and find_nn takes them out.
const c = RemoteChannel(()-> Channel{Tuple{NearestNeighbors.BallTree, Vector{String}}}(1))

# Helper: return `a` with every character outside the printable ASCII range
# (0x20 through 0x7e) removed.
@everywhere function stripc0x(a::String)
    unwanted = r"[^\x20-\x7e]"
    return replace(a, unwanted => "")
end

# Turn one article document into a `(text, uid)` tuple: the "content" entries are
# joined with two spaces and cleaned with `stripc0x`; "uid" is passed through as-is.
@everywhere function _process_mongo_article(article::LibBSON.BSONObject)
    text = stripc0x(join(article["content"], "  "))
    uid = article["uid"]
    return (text, uid)
end

# Convert every article yielded by `mongo_cursor` into a text/uid pair, run the texts
# through the Python vectorizer's `transform`, and return the adjoint of the result
# as a dense `Matrix` together with the uids (in cursor order).
@everywhere function process_documents(mongo_cursor::Mongo.MongoCursor, vectorizer::PyObject)
    records = map(_process_mongo_article, mongo_cursor)
    texts = first.(records)
    uids = last.(records)

    transformed = vectorizer[:transform](texts)
    return Matrix(adjoint(transformed)), uids
end

# Build a BallTree (Jaccard metric) from articles stored in MongoDB.
#
# Arguments:
#   vectorizer - Python object whose `transform` method turns article texts into
#                vectors (see `process_documents`).
#
# Returns a `(tree, uids)` tuple, where `uids` are the article uids returned by
# `process_documents` for the same documents that populate the tree.
@everywhere function build_tree(vectorizer::PyObject)
    # Connect to the MongoDB instance on this machine.
    client = MongoClient("mongodb://localhost")
    collection = MongoCollection(client, "db", "collection")

    # Query at most 100 documents whose provider is "ap"; the second tuple selects
    # the fields to return (presumably a projection - confirm against Mongo.jl).
    curs = find(collection, ("provider" => "ap"), ("_id" => false, "uid" => true, "content" => true); limit=100)
    m, uids = process_documents(curs, vectorizer)
    return NearestNeighbors.BallTree(m, Jaccard(); reorder=false), uids
end

# Loop forever: every 60 seconds start a task that rebuilds the tree with
# `build_tree` and puts the result into the channel `c`.
function manage_tree(vectorizer::PyObject)
    rebuild_interval = 60
    while true
        sleep(rebuild_interval)
        @async begin
            rebuilt = build_tree(vectorizer)
            put!(c, rebuilt)
        end
    end
end

# Handle one lookup request.
#
# The request body is parsed as JSON and its "point" entry is converted to a
# Vector{Float64}. If the channel `c` has a rebuilt tree waiting, it is taken and
# stored in `objdict`; otherwise the tree already cached in `objdict` is used.
# The result of `NearestNeighbors.inrange(tree, point, 0.8)` is returned as a JSON
# string under the key "idxs".
@everywhere function find_nn(req::HTTP.Request, objdict::Dict)
    payload = JSON.parse(HTTP.payload(req, String))
    point = Vector{Float64}(payload["point"])
    println(c)
    if !isready(c)
        # Nothing new in the channel: fall back to the cached tree.
        tree = objdict["tree"]
        uids = objdict["uids"]
    else
        # A rebuilt tree is available: consume it and refresh the cache.
        tree, uids = take!(c)
        objdict["tree"] = tree
        objdict["uids"] = uids
        println("new tree")
    end
    println(tree)
    neighbours = NearestNeighbors.inrange(tree, point, 0.8)
    return JSON.json(Dict("idxs" => neighbours))
end

# Assemble and start the service: load the pickled vectorizer through joblib, build
# the initial tree, register the POST "find_nn" route backed by `find_nn`, start
# serving on 0.0.0.0:8000 in a background task, then enter `manage_tree`.
function build_service()
    @pyimport sklearn.externals.joblib as jl
    vectorizer = jl.load("2c410c61-6016-4326-bb4b-83e81ebc1814.pkl")

    # Initial tree and uid list, cached in a dict shared with the request handler.
    tree, uids = build_tree(vectorizer)
    CacheValue = Union{typeof(tree), Vector{String}}
    objdict = Dict{String, CacheValue}("tree" => tree, "uids" => uids)

    router = HTTP.Router()
    handler = HTTP.HandlerFunction((req::HTTP.Request) -> find_nn(req, objdict))
    HTTP.register!(router, "POST", "find_nn", handler)

    server = HTTP.Servers.Server(router)
    @async HTTP.serve(server, ip"0.0.0.0", 8000)
    manage_tree(vectorizer)
end

# Start the service. This call does not return: build_service ends in manage_tree,
# which loops forever.
build_service()

I tried running the manage_tree function in the background, and also the HTTP.serve function (as shown above). In both cases, I get immediate responses from the web service to POST requests until the build_tree function is called. At that point, everything is blocked until it finishes.


#2

Figured it out (I need to read the docs more thoroughly): @async doesn’t actually run what you call with it on a separate process — it only schedules a task on the current one — so I needed to use @spawn (which does) and pass around my global channel object so that the main process and the secondary process could both read from and write to it.

Thanks!