Quantile regression in Flux v0.10

I am trying various quantile regression methods using Flux, but after upgrading to Julia v1.3.0 and Flux v0.10 I am experiencing problems. For example, the following example code no longer works

using Flux, Statistics, BenchmarkTools

##  some random training data
n = 10_000                  # number of observations
p = 10                      # number of features
x = rand(Float32, p, n)     # features column-wise: one column per observation
y = rand(Float32, n)    
# Minibatches of ~100 observations each; note n/100 evaluates to a Float64 here.
trdata = [(x[:,i], y[i]) for i in Flux.chunk(1:n, n/100)]

##  neural net
# Small MLP p -> 32 -> 16 -> 1; model(x) therefore returns a 1×batch matrix.
model = Chain(Dense(p, 32, relu),
              Dense(32, 16, relu),
              Dense(16, 1)) 

##  quantile loss function
prob = 0.8f0    # quantile level to fit (Float32 to match the data); captured as a global by qtloss
# Pinball ("check") loss in indicator form:
#   mean((1{y < yp} - prob) .* (yp - y)).
# NOTE(review): model(x) is a 1×batch matrix while y here is a batch-length
# vector (the thread later shows size(yp) == (1, 100), size(y) == (100,)),
# so `y .< yp` broadcasts to batch×batch; passing y' (a 1×batch row) at the
# call site avoids this.
function qtloss(yp, y)
    mean( ((y .< yp) .- prob) .* (yp .- y) )
end

# Per-minibatch loss: forward pass through the model, then the quantile loss.
loss(x, y) = qtloss(model(x), y)
# Time one full epoch of training (this is the call that errors on Flux v0.10).
@btime Flux.@epochs 1 Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())

and results in the following error

[ Info: Epoch 1
ERROR: MethodError: no method matching size(::Nothing, ::Int64)
Closest candidates are:
  size(::BitArray{1}, ::Integer) at bitarray.jl:81
  size(::Tuple, ::Integer) at tuple.jl:22
  size(::Number, ::Integer) at number.jl:63
  ...
Stacktrace:
 [1] (::Zygote.var"#1413#1414"{Nothing})(::Int64) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/broadcast.jl:46
 [2] ntuple at ./ntuple.jl:41 [inlined]
 [3] trim(::Array{Float32,1}, ::Nothing) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/broadcast.jl:46
 [4] unbroadcast at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/broadcast.jl:48 [inlined]
 [5] map at ./tuple.jl:159 [inlined]
 [6] #1459 at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/broadcast.jl:131 [inlined]
 [7] #3764#back at /home/johnbb/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49 [inlined]
 [8] #153 at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/lib.jl:142 [inlined]
 [9] #283#back at /home/johnbb/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49 [inlined]
 [10] broadcasted at ./broadcast.jl:1237 [inlined]
 [11] (::typeof(∂(broadcasted)))(::Array{Float32,2}) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/compiler/interface2.jl:0
 [12] (::typeof(∂(qtloss)))(::Float32) at ./REPL[9]:3
 [13] loss at ./REPL[10]:1 [inlined]
 [14] (::typeof(∂(loss)))(::Float32) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/compiler/interface2.jl:0
 [15] #153 at /home/johnbb/.julia/packages/Zygote/8dVxG/src/lib/lib.jl:142 [inlined]
 [16] #283#back at /home/johnbb/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49 [inlined]
 [17] #15 at /home/johnbb/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:69 [inlined]
 [18] (::typeof(∂(λ)))(::Float32) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/compiler/interface2.jl:0
 [19] (::Zygote.var"#38#39"{Zygote.Params,Zygote.Context,typeof(∂(λ))})(::Float32) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/compiler/interface.jl:101
 [20] gradient(::Function, ::Zygote.Params) at /home/johnbb/.julia/packages/Zygote/8dVxG/src/compiler/interface.jl:47
 [21] macro expansion at /home/johnbb/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:68 [inlined]
 [22] macro expansion at /home/johnbb/.julia/packages/Juno/oLB1d/src/progress.jl:134 [inlined]
 [23] #train!#12(::Flux.Optimise.var"#16#22", ::typeof(Flux.Optimise.train!), ::Function, ::Zygote.Params, ::Array{Tuple{Array{Float32,2},Array{Float32,1}},1}, ::ADAM) at /home/johnbb/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:66
 [24] train!(::Function, ::Zygote.Params, ::Array{Tuple{Array{Float32,2},Array{Float32,1}},1}, ::ADAM) at /home/johnbb/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:64
 [25] macro expansion at /home/johnbb/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:100 [inlined]
 [26] macro expansion at /home/johnbb/.julia/packages/Juno/oLB1d/src/progress.jl:134 [inlined]
 [27] ##core#506() at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:297
 [28] ##sample#507(::BenchmarkTools.Parameters) at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:303
 [29] #_run#6(::Bool, ::String, ::Base.Iterators.Pairs{Symbol,Integer,NTuple{4,Symbol},NamedTuple{(:samples, :evals, :gctrial, :gcsample),Tuple{Int64,Int64,Bool,Bool}}}, ::typeof(BenchmarkTools._run), ::BenchmarkTools.Benchmark{Symbol("##benchmark#505")}, ::BenchmarkTools.Parameters) at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:331
 [30] (::Base.var"#inner#2"{Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}},typeof(BenchmarkTools._run),Tuple{BenchmarkTools.Benchmark{Symbol("##benchmark#505")},BenchmarkTools.Parameters}})() at ./none:0
 [31] #invokelatest#1 at ./essentials.jl:713 [inlined]
 [32] #invokelatest at ./none:0 [inlined]
 [33] #run_result#37 at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:32 [inlined]
 [34] #run_result at ./none:0 [inlined]
 [35] #run#39(::Base.Iterators.Pairs{Symbol,Integer,NTuple{5,Symbol},NamedTuple{(:verbose, :samples, :evals, :gctrial, :gcsample),Tuple{Bool,Int64,Int64,Bool,Bool}}}, ::typeof(run), ::BenchmarkTools.Benchmark{Symbol("##benchmark#505")}, ::BenchmarkTools.Parameters) at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:46
 [36] #run at ./none:0 [inlined] (repeats 2 times)
 [37] #warmup#42 at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:79 [inlined]
 [38] warmup(::BenchmarkTools.Benchmark{Symbol("##benchmark#505")}) at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:79
 [39] top-level scope at /home/johnbb/.julia/packages/BenchmarkTools/7aqwe/src/execution.jl:390

Any hints are welcome. Thanks.

An example of QR with flux 0.10 is at https://github.com/mcreel/NeuralNetsForIndirectInference.jl. This is a project that depends on a fair number of supporting packages, it’s not a MWE. See the file Train.jl for the hints on QR with Flux.

Thanks! That seems to work with Flux v0.10. I have previously been using the “max” variant of the quantile loss in Keras from R, but I have actually never tried it in Julia because I thought it would be slow. I will see if I can do a benchmark.

A brief test with the following code

using Flux, Statistics, BenchmarkTools

# Random training data: p features × n observations, targets as a flat vector.
n = 10_000
p = 10
x = rand(Float32, p, n)
y = rand(Float32, n)    
# Split 1:n into ~100-observation minibatches (n/100 evaluates to a Float64 here).
trdata = [(x[:,i], y[i]) for i in Flux.chunk(1:n, n/100)]

# Small MLP p -> 32 -> 16 -> 1, so model(x) is a 1×batch matrix.
model = Chain(Dense(p, 32, relu),
              Dense(32, 16, relu),
              Dense(16, 1)) 

# Indicator form of the pinball loss: each residual (yp - y) is weighted by
# (1{y < yp} - prob), then averaged over all elements.
function qtloss1(yp, y, prob)
    below = y .< yp
    return mean((below .- prob) .* (yp .- y))
end
# Scalar pinball loss for one prediction/target pair: over-prediction
# (yp > y) costs (1 - prob)*(yp - y), under-prediction costs -prob*(yp - y).
function qtlossS2(yp, y, prob)
    d = yp - y
    return d > 0 ? (1 - prob) * d : -prob * d
end

# Scalar loss broadcast elementwise, then averaged over the batch.
qtloss2(yp, y, prob) = mean(qtlossS2.(yp, y, prob))
# Max form of the pinball loss: max(prob*(y - yp), (1 - prob)*(yp - y)).
# FIX: the first max argument was -prob * (y .- yp), i.e. +prob*(yp - y),
# which flips the sign of the under-prediction branch and can make the
# "loss" negative (at prob = 0.5 it reduced to 0.5*mean(yp .- y) instead of
# 0.5*mean(abs.(yp .- y))), disagreeing with qtloss1/qtloss5/qtloss6.
qtloss3(yp, y, prob)  = mean( max.(prob * (y .- yp), (1-prob) * (yp .- y)) )
# Fully-dotted max form of the pinball loss (broadcast-fused variant).
# FIX: the first max argument was -prob .* (y .- yp), i.e. +prob*(yp - y),
# which flips the sign of the under-prediction branch and can make the
# "loss" negative; corrected to prob .* (y .- yp) to match qtloss1/5/6.
qtloss4(yp, y, prob)  = mean( max.(prob .* (y .- yp), (1-prob) .* (yp .- y)) )
# Pinball loss computed from the residual yp - y: over-prediction costs
# (1 - prob)*residual, under-prediction costs -prob*residual; the max picks
# the non-negative branch, and the result is averaged over all elements.
function qtloss5(yp, y, prob)
    residual = yp .- y
    over = (1 - prob) * residual
    under = -prob * residual
    return mean(max.(over, under))
end
# Pinball loss on the residual yp - y with every elementwise operation
# written in broadcast (dot) form so the whole expression fuses.
function qtloss6(yp, y, prob)
    deviation = yp .- y
    penalty = max.((-prob) .* deviation, (1 - prob) .* deviation)
    return mean(penalty)
end

const prob = 0.8f0    # quantile level shared by all loss variants
# Benchmark each loss variant by rebinding `loss` and timing one training pass.
loss(x, y) = qtloss1(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
loss(x, y) = qtloss2(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
loss(x, y) = qtloss3(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
loss(x, y) = qtloss4(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
loss(x, y) = qtloss5(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
loss(x, y) = qtloss6(model(x), y, prob)
@btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())

gave these results for the respective Julia and Flux versions:

          v1.2 + v0.9    v1.3 + v0.10   
qtloss1     34.730 ms          error
qtloss2     20.124 ms        4.140 s
qtloss3     28.459 ms      227.389 ms
qtloss4      5.440 s       224.270 ms
qtloss5     25.532 ms      157.913 ms
qtloss6     29.383 ms      156.209 ms

Could you open an issue with flux or zygote with the performance regressions, depending on whether they are from the backwards or forward pass?

I can confirm the poor performance and I think it’s the backward pass, since the forward pass seems quite fast.

julia> loss(x, y) = qtloss6(model(x), y, prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, $(Flux.params(model)), $(trdata), $(Flux.ADAM()))
  171.408 ms (10059916 allocations: 260.54 MiB)

julia> @btime foreach(((x,y),)->loss(x,y), trdata)
  1.327 ms (1000 allocations: 11.45 MiB)

julia> @btime Flux.gradient(()->loss(trdata[1]...), $(Flux.params(model)));
  1.437 ms (100304 allocations: 2.59 MiB)

julia> @btime loss($(trdata[1])...);
  12.670 μs (10 allocations: 117.25 KiB)

Are you sure that the arrays have the right dimensions? I get

(size(y), size(yp)) = ((100,), (1, 100))

and if I transpose y it goes much faster. Notice the transposes below

julia> loss(x, y) = qtloss1(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())

  14.114 ms (158933 allocations: 14.75 MiB)

julia> loss(x, y) = qtloss2(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
  60.096 ms (486833 allocations: 26.28 MiB)

julia> loss(x, y) = qtloss3(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
  19.418 ms (221833 allocations: 16.48 MiB)

julia> loss(x, y) = qtloss4(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
  15.460 ms (211233 allocations: 16.08 MiB)

julia> loss(x, y) = qtloss5(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())

  17.424 ms (167433 allocations: 15.30 MiB)

julia> loss(x, y) = qtloss6(model(x), y', prob)
loss (generic function with 1 method)

julia> @btime Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
  13.962 ms (157033 allocations: 14.90 MiB)

Thanks, you are right! Unfortunately, I forgot once again that Flux expects/returns the transpose of what I am used to. I now get the following results, which are in favor of the new version

          v1.2 + v0.9    v1.3 + v0.10   
qtloss1     19.350 ms       13.669 ms
qtloss2     13.557 ms       53.741 ms
qtloss3     17.797 ms       18.613 ms
qtloss4     78.974 ms       15.293 ms
qtloss5     16.407 ms       16.906 ms
qtloss6     17.077 ms       13.938 ms

Thanks for these results! :+1:

Good solution. Thanks. I've had a similar problem, and it does NOT work on the GPU. Could anyone help with this? What I need is to get my own custom loss function working on the GPU with Julia 1.3 and Flux v0.10.

Thanks for any help.

julia> Flux.train!(loss, Flux.params(model), trdata, Flux.ADAM())
ERROR: MethodError: no method matching iterate(::Nothing)
Closest candidates are:
  iterate(::Core.SimpleVector) at essentials.jl:604
  iterate(::Core.SimpleVector, ::Any) at essentials.jl:604
  iterate(::ExponentialBackOff) at error.jl:214
  ...
Stacktrace:
 [1] append_any(::Any, ::Vararg{Any,N} where N) at ./essentials.jl:722
 [2] (::getfield(Zygote, Symbol("##1712#1716")){getfield(Zygote, Symbol("##1583#1587"))})(::CuArrays.CuArray{Float32,2,Nothing}) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/lib/broadcast.jl:179
 [3] (::getfield(Zygote, Symbol("##4#back#1717")){getfield(Zygote, Symbol("##1712#1716")){getfield(Zygote, Symbol("##1583#1587"))}})(::CuArrays.CuArray{Float32,2,Nothing}) at /home/hanusl/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49
 [4] (::getfield(Zygote, Symbol("##153#154")){getfield(Zygote, Symbol("##4#back#1717")){getfield(Zygote, Symbol("##1712#1716")){getfield(Zygote, Symbol("##1583#1587"))}},Tuple{NTuple{4,Nothing},Tuple{}}})(::CuArrays.CuArray{Float32,2,Nothing}) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/lib/lib.jl:142
 [5] (::getfield(Zygote, Symbol("##283#back#155")){getfield(Zygote, Symbol("##153#154")){getfield(Zygote, Symbol("##4#back#1717")){getfield(Zygote, Symbol("##1712#1716")){getfield(Zygote, Symbol("##1583#1587"))}},Tuple{NTuple{4,Nothing},Tuple{}}}})(::CuArrays.CuArray{Float32,2,Nothing}) at /home/hanusl/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49
 [6] broadcasted at ./broadcast.jl:1216 [inlined]
 [7] qtloss1 at ./REPL[37]:1 [inlined]
 [8] (::typeof(∂(qtloss1)))(::Float32) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/compiler/interface2.jl:0
 [9] loss at ./REPL[38]:1 [inlined]
 [10] (::typeof(∂(loss)))(::Float32) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/compiler/interface2.jl:0
 [11] (::getfield(Zygote, Symbol("##153#154")){typeof(∂(loss)),Tuple{Tuple{Nothing,Nothing}}})(::Float32) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/lib/lib.jl:142
 [12] (::getfield(Zygote, Symbol("##283#back#155")){getfield(Zygote, Symbol("##153#154")){typeof(∂(loss)),Tuple{Tuple{Nothing,Nothing}}}})(::Float32) at /home/hanusl/.julia/packages/ZygoteRules/6nssF/src/adjoint.jl:49
 [13] #15 at /home/hanusl/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:69 [inlined]
 [14] (::typeof(∂(λ)))(::Float32) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/compiler/interface2.jl:0
 [15] (::getfield(Zygote, Symbol("##38#39")){Zygote.Params,Zygote.Context,typeof(∂(λ))})(::Float32) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/compiler/interface.jl:101
 [16] gradient(::Function, ::Zygote.Params) at /home/hanusl/.julia/packages/Zygote/fw4Oc/src/compiler/interface.jl:47
 [17] macro expansion at /home/hanusl/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:68 [inlined]
 [18] macro expansion at /home/hanusl/.julia/packages/Juno/oLB1d/src/progress.jl:134 [inlined]
 [19] #train!#12(::getfield(Flux.Optimise, Symbol("##16#22")), ::typeof(Flux.Optimise.train!), ::Function, ::Zygote.Params, ::Array{Tuple{CuArrays.CuArray{Float32,2,Nothing},CuArrays.CuArray{Float32,1,Nothing}},1}, ::ADAM) at /home/hanusl/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:66
 [20] train!(::Function, ::Zygote.Params, ::Array{Tuple{CuArrays.CuArray{Float32,2,Nothing},CuArrays.CuArray{Float32,1,Nothing}},1}, ::ADAM) at /home/hanusl/.julia/packages/Flux/oX9Pi/src/optimise/train.jl:64
 [21] top-level scope at REPL[64]:1

For me, all loss functions except qtloss1 work on the GPU, but the example is roughly 5 times slower than on the CPU with Julia 1.3.1 and Flux 0.10.