How to implement a custom StridedArray or DenseArray?

I have some C-code which returns a pointer to an array, which I can wrap using `unsafe_wrap`

. I need to build a custom array-implementation, but operations on this custom array is a lot slower than on the array (wrapped or not):

```
abstract type AbstractFooArray{T,N} <: DenseArray{T,N}
end
mutable struct FooArray{T,N} <: AbstractFooArray{T,N}
array::DenseArray{T,N}
end
Base.IndexStyle(::Type{<:AbstractFooArray{T,N}}) where {T,N} = IndexLinear()
Base.getindex(a::AbstractFooArray{T,N}, i::Int) where {T,N} = a.array[i]
Base.setindex!(a::AbstractFooArray{T,N}, v, i::Int) where {T,N} = a.array[i] = v
Base.size(a::AbstractFooArray{T,N}) where {T,N} = size(a.array)
#Base.strides(a::AbstractFooArray{T,N}) where {T,N} = strides(a.array)
#Base.unsafe_convert(::Type{Ptr{T}}, a::AbstractFooArray{T,N}) where {T,N} = unsafe_convert(Ptr{T}, a.array)
#Base.has_fast_linear_indexing(a::AbstractFooArray{T,N}) where {T,N} = true
a = zeros(UInt8, (3, 2448, 2048));
b = unsafe_wrap(Array, pointer(a), size(a));
a_f = FooArray(a)
b_f = FooArray(b)
```

```
@time sum(a)
@time sum(b)
```

```
0.002121 seconds (4 allocations: 160 bytes)
0.002241 seconds (4 allocations: 160 bytes)
```

```
@time sum(a_f)
@time sum(b_f)
```

```
0.562584 seconds (16 allocations: 400 bytes)
0.546441 seconds (16 allocations: 400 bytes)
```

The reported numbers are after compilation warm-up (second run).