I am *new* to Julia and to CUDA programming. I am using Julia 1.0.5.

I wrote some code to do neural network inference: it consists of a series of multiplications (neural network weights × inputs/layers) and bias additions.

I would like your feedback on my code, and I am open to any suggestions to speed up the calculations (the multiplication and addition operations).

```
export evaluate_network_gpu
using CuArrays, CUDAdrv, CUDAnative, ONNX, Flux, BenchmarkTools
# Fail fast on accidental scalar indexing of a CuArray: element-wise host
# access would silently serialize GPU work into one transfer per element.
CuArrays.allowscalar(false)

# Weight/bias tensors exported via ONNX (13 dense layers).
const weights = ONNX.load_weights("../../../gpu/weights.bson")

# Per-feature min/max normalization bounds for the 7 network inputs.
# `const` matters: non-const globals are `Any`-typed in Julia, which defeats
# type inference in every function that reads them (a major slowdown).
const maxs = [96189.14, 3.141593, 3.141593, 275.7858, 321.5928, 5.0, 5.0]
const mins = [11.73505, -3.141593, -3.141593, 55.54866, 12.71186, 1.0, 1.0]

# Upload each layer's kernel (_k) and bias (_b) to the GPU once, at load
# time, so no per-inference host→device weight transfers are needed.
const d1_k = CuArray(weights["dense_1/MatMul/ReadVariableOp:0"])
const d1_b = CuArray(weights["dense_1/BiasAdd/ReadVariableOp:0"])
const d2_k = CuArray(weights["dense_2/MatMul/ReadVariableOp:0"])
const d2_b = CuArray(weights["dense_2/BiasAdd/ReadVariableOp:0"])
const d3_k = CuArray(weights["dense_3/MatMul/ReadVariableOp:0"])
const d3_b = CuArray(weights["dense_3/BiasAdd/ReadVariableOp:0"])
const d4_k = CuArray(weights["dense_4/MatMul/ReadVariableOp:0"])
const d4_b = CuArray(weights["dense_4/BiasAdd/ReadVariableOp:0"])
const d5_k = CuArray(weights["dense_5/MatMul/ReadVariableOp:0"])
const d5_b = CuArray(weights["dense_5/BiasAdd/ReadVariableOp:0"])
const d6_k = CuArray(weights["dense_6/MatMul/ReadVariableOp:0"])
const d6_b = CuArray(weights["dense_6/BiasAdd/ReadVariableOp:0"])
const d7_k = CuArray(weights["dense_7/MatMul/ReadVariableOp:0"])
const d7_b = CuArray(weights["dense_7/BiasAdd/ReadVariableOp:0"])
const d8_k = CuArray(weights["dense_8/MatMul/ReadVariableOp:0"])
const d8_b = CuArray(weights["dense_8/BiasAdd/ReadVariableOp:0"])
const d9_k = CuArray(weights["dense_9/MatMul/ReadVariableOp:0"])
const d9_b = CuArray(weights["dense_9/BiasAdd/ReadVariableOp:0"])
const d10_k = CuArray(weights["dense_10/MatMul/ReadVariableOp:0"])
const d10_b = CuArray(weights["dense_10/BiasAdd/ReadVariableOp:0"])
const d11_k = CuArray(weights["dense_11/MatMul/ReadVariableOp:0"])
const d11_b = CuArray(weights["dense_11/BiasAdd/ReadVariableOp:0"])
const d12_k = CuArray(weights["dense_12/MatMul/ReadVariableOp:0"])
const d12_b = CuArray(weights["dense_12/BiasAdd/ReadVariableOp:0"])
const d13_k = CuArray(weights["dense_13/MatMul/ReadVariableOp:0"])
const d13_b = CuArray(weights["dense_13/BiasAdd/ReadVariableOp:0"])
"""
    mult_add(a, b, c)

Apply one dense layer: compute `relu.(b * a .+ c)`, where `a` is the
previous layer's activation vector, `b` the weight matrix, and `c` the
bias vector.
"""
function mult_add(a, b, c)
    # For CuArrays, `b * a` dispatches to the same CUBLAS gemv the original
    # called explicitly, but this form also works for CPU arrays and for a
    # matrix `a` (batched inputs). The dotted ops fuse the bias add and the
    # ReLU into a single broadcast kernel.
    return relu.(b * a .+ c)
end
"""
    evaluate_network_gpu(x_c)

Run a forward pass of the 13-layer dense network on the GPU.

`x_c` is the raw 7-element input vector. It is min-max normalized with the
global `mins`/`maxs` bounds, converted to `Float32` (the weight precision),
uploaded to the device, and pushed through the layers with `mult_add`.
Returns the final layer's activations as a `CuArray`.
"""
function evaluate_network_gpu(x_c)
    # Normalize on the host and transfer ONCE. The original uploaded `mins`
    # and `maxs` to the GPU on every call (the `maxs - mins` upload even
    # happened twice) — several needless device allocations per inference.
    # Arithmetic order matches the original: Float64 normalize, then Float32.
    h = CuArray(Float32.((Array(x_c) .- mins) ./ (maxs .- mins)))
    # Chain through the 13 dense layers instead of 13 copy-pasted calls.
    for (k, b) in ((d1_k, d1_b), (d2_k, d2_b), (d3_k, d3_b), (d4_k, d4_b),
                   (d5_k, d5_b), (d6_k, d6_b), (d7_k, d7_b), (d8_k, d8_b),
                   (d9_k, d9_b), (d10_k, d10_b), (d11_k, d11_b),
                   (d12_k, d12_b), (d13_k, d13_b))
        h = mult_add(h, k, b)
    end
    return h
end
#function vadd(a, b, c)
# i = (blockIdx().x-1) * blockDim().x + threadIdx().x
# c[i] = a[i] + b[i]
# return
#end
```

Thank you