Performance issues aside, this looks like a serious data race to me. I think you meant to write:
"""
    threadedGram2(x::Array{Array{Float64, 2}})

Return the sum of the Gram matrices `x[i]' * x[i]` over all matrices in `x`,
computed in parallel.

Every matrix in `x` must have the same number of columns `p` as `x[1]`; the
result is a `p × p` `Matrix{Float64}`.

The work is split into at most `Threads.nthreads()` contiguous chunks of
indices. Each chunk is handled by its own task, which accumulates into a
private matrix, so no accumulator is ever shared between concurrently running
tasks and nothing is indexed by `Threads.threadid()`.

# Throws
- `BoundsError` if `x` is empty (the output size is taken from `x[1]`).
- `DimensionMismatch` (from `mul!`) if some matrix has a different number of
  columns than `x[1]`.
"""
function threadedGram2(x::Array{Array{Float64, 2}})
    p = size(x[1], 2)
    n = length(x)

    # One chunk per available thread, but never more chunks than matrices.
    nchunks = min(n, Threads.nthreads())
    chunks = Iterators.partition(1:n, cld(n, nchunks))

    tasks = map(chunks) do idxs
        Threads.@spawn begin
            # Task-local accumulator: safe even if the task migrates threads.
            acc = zeros(Float64, p, p)
            for i in idxs
                # acc = x[i]' * x[i] * 1 + acc * 1, without temporaries.
                LinearAlgebra.mul!(acc, x[i]', x[i], 1, 1)
            end
            acc
        end
    end

    # `fetch` on a Task is not inferrable, so assert the concrete type.
    r = fetch(first(tasks))::Matrix{Float64}
    for t in Iterators.drop(tasks, 1)
        r .+= fetch(t)::Matrix{Float64}
    end
    return r
end