Issues with PyCall and parallelization using HTTP.jl-built web service

pazzo83 · July 25, 2018, 10:05pm

Hi all, in 0.7 beta, I am trying to create a web service using HTTP.jl that can utilize a periodically-updating tree (built with NearestNeighbors.jl). We need to use sklearn from python to generate vectors with which to populate the tree, and that usage of PyCall seems to be blocking everything and not allowing the web server to respond immediately. Any ideas as to how to get these to work together? Below is basically the code I’m running:

using HTTP
@everywhere using HTTP
using JSON
@everywhere using JSON
using Mongo
@everywhere using Mongo
using PyCall
@everywhere using PyCall
using LibBSON
@everywhere using LibBSON
using Distances
@everywhere using Distances

@everywhere include("NearestNeighbors.jl/src/NearestNeighbors.jl")

const c = RemoteChannel(()-> Channel{Tuple{NearestNeighbors.BallTree, Vector{String}}}(1))

# helper function to strip out bad non-ascii
@everywhere stripc0x(a::String) = replace(a, r"[^\x20-\x7e]" => "")

@everywhere function _process_mongo_article(article::LibBSON.BSONObject)
    return (stripc0x(join(article["content"], "  ")), article["uid"])
end

@everywhere function process_documents(mongo_cursor::Mongo.MongoCursor, vectorizer::PyObject)
    mongo_objs = [_process_mongo_article(x) for x in mongo_cursor]
    art_contents = getindex.(mongo_objs, 1)
    uids = getindex.(mongo_objs, 2)

    m = vectorizer[:transform](art_contents)
    return Matrix(m'), uids
end

@everywhere function build_tree(vectorizer::PyObject)
    client = MongoClient("mongodb://locallhost")
    collection = MongoCollection(client, "db", "collection")

    curs = find(collection, ("provider" => "ap"), ("_id" => false, "uid" => true, "content" => true); limit=100)
    m, uids = process_documents(curs, vectorizer)
    return NearestNeighbors.BallTree(m, Jaccard(); reorder=false), uids
end

function manage_tree(vectorizer::PyObject)    
    while true
        sleep(60)
        @async put!(c, build_tree(vectorizer))
    end
end

@everywhere function find_nn(req::HTTP.Request, objdict::Dict)
    data = JSON.parse(HTTP.payload(req,String))
    point = Vector{Float64}(data["point"])
    println(c)
    if isready(c)
        tree, uids = take!(c)
        objdict["tree"] = tree
        objdict["uids"] = uids
        println("new tree")
    else
        tree = objdict["tree"]
        uids = objdict["uids"]
    end
    println(tree)
    ret = NearestNeighbors.inrange(tree, point, 0.8)
    ret_dict = Dict("idxs" => ret)
    return JSON.json(ret_dict)
end

function build_service()
    router = HTTP.Router()
    @pyimport sklearn.externals.joblib as jl
    vectorizer = jl.load("2c410c61-6016-4326-bb4b-83e81ebc1814.pkl")
    tree, uids = build_tree(vectorizer)
    objdict = Dict{String, Union{typeof(tree), Vector{String}}}("tree" => tree, "uids" => uids)
    HTTP.register!(router, "POST", "find_nn", HTTP.HandlerFunction((req::HTTP.Request) -> find_nn(req, objdict)))
    server = HTTP.Servers.Server(router)
    @async HTTP.serve(server, ip"0.0.0.0", 8000)
    manage_tree(vectorizer)
end

build_service()

I tried running the manage_tree function in the background, and also the HTTP.serve function (as is shown above). Both times, I get immediate responses from the web service using POST requests until the build_tree function is called. At that point, everything is blocked until it is done.

pazzo83 · July 26, 2018, 12:12am

Figured it out (I need to read the docs more thoroughly) → @async doesn’t actually launch what you call with it on a separate proc, so I needed to call @spawn (which does) and pass around my global channel obj so the main proc and the secondary proc could read/write from/to it.

Thanks!

Topic		Replies	Views
PyCall error I don't understand General Usage pycall	4	566	September 2, 2021
PythonCall spends a lot of time showing stuff for JAX Performance python , pythoncall , jax	3	216	October 15, 2024
Julia multithreading from pycall - possible to overcome? Specific Domains question	3	580	March 28, 2022
Run multiple python instances with pycall in different threads General Usage pycall , parallel	7	1527	June 13, 2024
How to call PythonCall.jl behind a julia HTTP webserver? General Usage question , multithreading , http , pythoncall	2	74	September 24, 2024

Issues with PyCall and parallelization using HTTP.jl-built web service

Related topics