Thanks a ton. This worked with minor tweaks to your fg!. Question for you: do we need to pre-initialize G somewhere inside fg!?
Copying the full code and output here for anyone who runs into something similar. I converted the likelihood function to a log-likelihood and adjusted the gradients accordingly, since that's the more typical setup in statistics/economics.
#packages
using Distributed, SharedArrays
using DataFrames, StatsBase, Distributions
using ForwardDiff, Optim
#data
data = DataFrame(x = repeat([1,2], 500), y = sample([0,1], 1000))
#log-likelihood
function likelihood(beta::Array{T}, data::R) where {T, R}
    # Prob[i] = P(y_i | x_i, beta) for a one-parameter logit
    Prob = SharedArray{T}(1000, 1)
    @distributed (+) for i in 1:1000
        if data[!, :y][i] == 1
            Prob[i] = exp(beta[1] * data[!, :x][i]) / (1 + exp(beta[1] * data[!, :x][i]))
        else
            Prob[i] = 1 / (1 + exp(beta[1] * data[!, :x][i]))
        end
    end
    return Prob, -sum(log.(Prob)) / 1000
end
#manual gradients of log-likelihood
function gradient_likelihood(Prob, data::R) where {R}
    grad = SharedArray{Float64}(1000, 1)
    @distributed (+) for i in 1:1000
        # dlog(P_i)/dbeta = (dP_i/dbeta) / P_i
        if data[!, :y][i] == 1
            grad[i] = (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
        else
            grad[i] = -(Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
        end
    end
    return -sum(grad) / 1000
end
#check results
julia> likelihood([1.0], data)
([0.2689414213699951; 0.8807970779778824; … ; 0.2689414213699951; 0.8807970779778824], 0.9240948492805974)
julia> gradient_likelihood(likelihood([1.0], data)[1], data)
0.4503263672928852
julia> ForwardDiff.gradient(x -> likelihood(x, data)[2], [1.0])
1-element Array{Float64,1}:
 0.4503263672928852
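As an extra sanity check, the logit model also gives a closed-form gradient, d(-mean log P_i)/dβ = -mean((y_i - P(y_i=1|x_i,β)) x_i), so a plain vectorized version can be compared against the distributed ones above. A minimal sketch (negloglik_check is just an illustrative name, not part of the code above):
function negloglik_check(beta, data)
    p = 1 ./ (1 .+ exp.(-beta[1] .* data[!, :x]))                       # P(y=1 | x, beta)
    logp = data[!, :y] .* log.(p) .+ (1 .- data[!, :y]) .* log.(1 .- p)
    grad = -sum((data[!, :y] .- p) .* data[!, :x]) / nrow(data)         # closed-form gradient
    return -sum(logp) / nrow(data), grad
end
negloglik_check([1.0], data) should reproduce, up to floating point, the objective and gradient shown above.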
#avoid repeating computations in Optim.optimize
function fg!(F, G, beta)
    # fill Prob once and reuse it for both the objective and the gradient
    Prob = SharedArray{Float64}(1000, 1)
    @distributed (+) for i in 1:1000
        if data[!, :y][i] == 1
            Prob[i] = exp(beta[1] * data[!, :x][i]) / (1 + exp(beta[1] * data[!, :x][i]))
        else
            Prob[i] = 1 / (1 + exp(beta[1] * data[!, :x][i]))
        end
    end
    # gradient only when Optim asks for it
    if G !== nothing
        grad = SharedArray{Float64}(1000, 1)
        @distributed (+) for i in 1:1000
            if data[!, :y][i] == 1
                grad[i] = (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
            else
                grad[i] = -(Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
            end
        end
        copyto!(G, -sum(grad) / 1000)
    end
    # objective only when Optim asks for it
    if F !== nothing
        return -sum(log.(Prob)) / 1000
    end
end
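Side note: since likelihood already returns Prob alongside the objective, the same thing can be written more compactly by reusing the two functions above. A sketch (fg2! is just an illustrative name; I have not re-run the optimization with it):
function fg2!(F, G, beta)
    Prob, nll = likelihood(beta, data)        # Prob computed once, shared by both pieces
    if G !== nothing
        G[1] = gradient_likelihood(Prob, data)
    end
    if F !== nothing
        return nll                            # negative mean log-likelihood
    end
end
Optim.optimize(Optim.only_fg!(fg2!), rand(Uniform(), 1), Optim.LBFGS()) should then behave the same way as the call below.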
julia> Optim.optimize(Optim.only_fg!(fg!), rand(Uniform(), 1), Optim.LBFGS(), Optim.Options(show_trace = true))
Iter     Function value   Gradient norm 
     0     7.616226e-01     2.766918e-01
 * time: 0.0
     1     6.915962e-01     1.335341e-02
 * time: 0.031000137329101562
     2     6.914531e-01     1.507051e-06
 * time: 0.04999995231628418
     3     6.914531e-01     1.105449e-15
 * time: 0.07500004768371582
 * Status: success
 * Candidate solution
    Minimizer: [7.37e-02]
    Minimum:   6.914531e-01
 * Found with
    Algorithm:     L-BFGS
    Initial Point: [5.61e-01]
 * Convergence measures
    |x - x'|               = 2.42e-06 ≰ 0.0e+00
    |x - x'|/|x'|          = 3.29e-05 ≰ 0.0e+00
    |f(x) - f(x')|         = 1.83e-12 ≰ 0.0e+00
    |f(x) - f(x')|/|f(x')| = 2.64e-12 ≰ 0.0e+00
    |g(x)|                 = 1.11e-15 ≤ 1.0e-08
 * Work counters
    Seconds run:   0  (vs limit Inf)
    Iterations:    3
    f(x) calls:    9
    ∇f(x) calls:   9