Here is a reduced version of my code:

```
"""
    f!(A, x, y, z, nchunks)

Accumulate `x[col] * z[idx]` into `y[row]` for every linear index `idx` stored in
`A.ints[1:nchunks]`, where `(row, col) = ind2sub(A.dims, idx)`.

Exactly 128 entries are read from each chunk. `y` is updated in place; `x` and `z`
are only read. Returns `nothing`.

All bounds checks are disabled, so the caller must guarantee that `A.ints` holds at
least `nchunks` chunks of at least 128 entries each, and that every stored index is
valid for `z` and maps to subscripts that are valid for `y` (row) and `x` (column).
"""
function f!{T<:Real}(A::MyImmutableType, x::Vector{T}, y::Vector{T}, z::Vector{T},
                     nchunks::Integer)
    # `@inbounds` on the outer loop covers every indexing expression below.
    @inbounds for chunk = 1:nchunks
        linear_inds = A.ints[chunk]
        for k = 1:128
            lin = linear_inds[k]
            r, c = ind2sub(A.dims, lin)
            y[r] += x[c] * z[lin]
        end
    end
    return nothing
end
"""
    f(A, x, y, z, nchunks)

Return the sum of `y[row] + x[col] * z[idx]` over every linear index `idx` stored in
`A.ints[1:nchunks]`, where `(row, col) = ind2sub(A.dims, idx)`.

Exactly 128 entries are read from each chunk. None of `x`, `y`, `z` is modified.
The accumulator starts from the `Float64` literal `0.0`.

Bounds checks are disabled in the inner loop, so the caller must guarantee that
`A.ints` holds at least `nchunks` chunks of at least 128 entries each, and that every
stored index is valid for `z` and maps to subscripts that are valid for `y` (row) and
`x` (column).
"""
function f{T<:Real}(A::MyImmutableType, x::Vector{T}, y::Vector{T}, z::Vector{T},
                    nchunks::Integer)
    y_scalar = 0.0
    for chunk = 1:nchunks
        @inbounds for i = 1:128
            # Look the linear index up once; it is needed for both `ind2sub` and `z`.
            idx = A.ints[chunk][i]
            row_ind, col_ind = ind2sub(A.dims, idx)
            zval = z[idx]
            xval = x[col_ind]
            yval = y[row_ind]
            # Uses `xval`; the previous code referenced an undefined name `xv` here.
            y_scalar += yval + xval * zval
        end
    end
    return y_scalar
end
```

The first function (`f!`) takes ~1.5 seconds to run, while the second (`f`) takes only ~0.03 seconds. Any thoughts on why they differ so much?