The difference seems entirely due to @simd:
julia> function mysum(arr)
# Baseline version (no @simd): returns a vector whose entry j is the sum of
# arr[i, j] over all i in axes(arr, 1).
# One zero-initialized output slot per index along arr's second dimension.
result = zeros(eltype(arr), size(arr, 2))
# @inbounds wraps the whole outer loop, so it applies to both arr[i, j] and result[j].
# NOTE(review): result is sized with size(arr, 2) but indexed with j from axes(arr, 2);
# these agree only if arr's second axis is 1-based — confirm for the inputs used.
@inbounds for j in axes(arr, 2)
# Accumulator starts at the zero of arr's element type.
t = zero(eltype(arr))
# Inner loop runs over the first index, with j held fixed.
for i in axes(arr, 1)
t += arr[i, j]
end
result[j] = t
end
return result
end
mysum (generic function with 1 method)
julia> @btime mysum($arr);
165.281 ms (2 allocations: 781.30 KiB)
julia> function mysum(arr)
# @simd version: returns a vector whose entry j is the sum of arr[i, j] over
# all i in axes(arr, 1); the inner loop is annotated with @simd.
# One zero-initialized output slot per index along arr's second dimension.
result = zeros(eltype(arr), size(arr, 2))
# @inbounds wraps the whole outer loop, so it applies to both arr[i, j] and result[j].
# NOTE(review): result is sized with size(arr, 2) but indexed with j from axes(arr, 2);
# these agree only if arr's second axis is 1-based — confirm for the inputs used.
@inbounds for j in axes(arr, 2)
# Accumulator starts at the zero of arr's element type.
t = zero(eltype(arr))
# Per the Julia manual, @simd permits the compiler to reorder iterations of this
# loop, including re-associating the reduction into t.
# NOTE(review): for floating-point arr the result may therefore differ in the last
# bits from a strictly sequential sum — confirm this is acceptable.
@simd for i in axes(arr, 1)
t += arr[i, j]
end
result[j] = t
end
return result
end
mysum (generic function with 1 method)
julia> @btime mysum($arr);
53.112 ms (2 allocations: 781.30 KiB)
julia> @btime sum($arr, dims=1);
52.268 ms (2 allocations: 781.30 KiB)