At present, Julia always uses C’s “memmove” to copy data from one Array to another when both have the same bits-type or bits-union element type.
On my desktop, “memcpy” seems to be faster than “memmove”. Here is some benchmark code:
"""
    memmove!(dest::Array{T}, src::Array{T}, n = length(src)) where T

Copy the first `n` elements of `src` into `dest` with C's `memmove` and return `dest`.

The number of bytes copied is `n * Base.aligned_sizeof(T)`. Because `memmove` is
used, `dest` and `src` may share memory.

NOTE(review): only the raw element bytes are copied; this presumably is only
meaningful for element types stored inline (plain bits types) -- confirm callers
never pass arrays of heap references.

# Throws
- `ArgumentError` if `n` is negative or larger than `length(dest)` or `length(src)`.
"""
function memmove!(dest::Array{T}, src::Array{T}, n = length(src)) where T
    # Reject counts that would make the C call read or write outside either buffer.
    0 <= n <= min(length(dest), length(src)) || throw(ArgumentError(
        "n must be between 0 and min(length(dest), length(src)), got $n"))
    nbytes = n * Base.aligned_sizeof(T)
    # Keep both arrays rooted while their raw pointers are in use.
    GC.@preserve dest src begin
        ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
              pointer(dest), pointer(src), nbytes)
    end
    return dest
end
"""
    memcpy!(dest::Array{T}, src::Array{T}, n = length(src)) where T

Copy the first `n` elements of `src` into `dest` with C's `memcpy` and return `dest`.

The number of bytes copied is `n * Base.aligned_sizeof(T)`. C's `memcpy` is
undefined when the two byte ranges overlap, so the ranges are compared first and
`memmove` is called instead whenever they overlap.

NOTE(review): only the raw element bytes are copied; this presumably is only
meaningful for element types stored inline (plain bits types) -- confirm callers
never pass arrays of heap references.

# Throws
- `ArgumentError` if `n` is negative or larger than `length(dest)` or `length(src)`.
"""
function memcpy!(dest::Array{T}, src::Array{T}, n = length(src)) where T
    # Reject counts that would make the C call read or write outside either buffer.
    0 <= n <= min(length(dest), length(src)) || throw(ArgumentError(
        "n must be between 0 and min(length(dest), length(src)), got $n"))
    nbytes = convert(Csize_t, n * Base.aligned_sizeof(T))
    # Keep both arrays rooted while their raw pointers are in use.
    GC.@preserve dest src begin
        destp = pointer(dest)
        srcp = pointer(src)
        dest_addr = UInt(destp)
        src_addr = UInt(srcp)
        # Half-open ranges [addr, addr + nbytes) intersect iff each one starts
        # before the other one ends; an empty copy never overlaps.
        overlapping = dest_addr < src_addr + nbytes && src_addr < dest_addr + nbytes
        if overlapping
            ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
                  destp, srcp, nbytes)
        else
            ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
                  destp, srcp, nbytes)
        end
    end
    return dest
end
# Benchmark driver: time `memcpy!` against `memmove!` for buffer sizes doubling
# from 8 bytes up to 1 MiB (2^3 .. 2^20). `@btime` comes from BenchmarkTools, so
# `using BenchmarkTools` must have been run before this loop.
#
# A `for` loop is used because its loop variable is always local. Updating a global
# counter inside a top-level `while` is ambiguous under the soft-scope rule and
# fails with `UndefVarError` when the code is run from a file instead of the REPL.
for exponent in 3:20
    len = 2^exponent
    # Fill the source so the copy reads initialized memory rather than `undef` bytes.
    a = rand(UInt8, len)
    b = similar(a)
    println(len,"bytes")
    @btime memcpy!($b, $a)
    @btime memmove!($b, $a)
end
and its results (columns: buffer size, memcpy time, memmove time):
8bytes 3.200 ns (0 allocations: 0 bytes) 6.200 ns (0 allocations: 0 bytes)
16bytes 3.200 ns (0 allocations: 0 bytes) 6.100 ns (0 allocations: 0 bytes)
32bytes 3.200 ns (0 allocations: 0 bytes) 6.100 ns (0 allocations: 0 bytes)
64bytes 4.800 ns (0 allocations: 0 bytes) 7.300 ns (0 allocations: 0 bytes)
128bytes 5.900 ns (0 allocations: 0 bytes) 8.700 ns (0 allocations: 0 bytes)
256bytes 13.900 ns (0 allocations: 0 bytes) 11.612 ns (0 allocations: 0 bytes)
512bytes 15.716 ns (0 allocations: 0 bytes) 16.132 ns (0 allocations: 0 bytes)
1024bytes 19.100 ns (0 allocations: 0 bytes) 22.267 ns (0 allocations: 0 bytes)
2048bytes 27.008 ns (0 allocations: 0 bytes) 36.556 ns (0 allocations: 0 bytes)
4096bytes 42.944 ns (0 allocations: 0 bytes) 65.682 ns (0 allocations: 0 bytes)
8192bytes 72.290 ns (0 allocations: 0 bytes) 124.127 ns (0 allocations: 0 bytes)
16384bytes 135.402 ns (0 allocations: 0 bytes) 300.000 ns (0 allocations: 0 bytes)
32768bytes 524.084 ns (0 allocations: 0 bytes) 723.529 ns (0 allocations: 0 bytes)
65536bytes 1.030 μs (0 allocations: 0 bytes) 1.470 μs (0 allocations: 0 bytes)
131072bytes 2.333 μs (0 allocations: 0 bytes) 3.163 μs (0 allocations: 0 bytes)
262144bytes 6.080 μs (0 allocations: 0 bytes) 8.000 μs (0 allocations: 0 bytes)
524288bytes 13.000 μs (0 allocations: 0 bytes) 17.700 μs (0 allocations: 0 bytes)
1048576bytes 27.100 μs (0 allocations: 0 bytes) 35.600 μs (0 allocations: 0 bytes)
I think this result may depend on the operating system. If this speedup holds in most cases, maybe we could open a PR that calls “memcpy” when the two arrays do not overlap?