Use "memcpy" instead of "memmove" to copy Array when there's no overlap?

At present, Julia always uses C’s “memmove” to copy data from one Array to another when both have the same bits-type or bits-union element type.
On my desktop “memcpy” seems faster than “memmove”. Here’s some benchmark code:

"""
    memmove!(dest::Array{T}, src::Array{T}, n = length(src)) where T

Copy the first `n` elements of `src` into the start of `dest` with C `memmove`
and return `dest`.

The number of bytes moved is `n * Base.aligned_sizeof(T)`. C `memmove` allows the
source and destination regions to overlap.

Throws a `BoundsError` when `n` is larger than `length(dest)` or `length(src)`;
the underlying C call performs no bounds checking of its own.
"""
function memmove!(dest::Array{T}, src::Array{T}, n = length(src)) where T
    # Reject oversized copies up front: the raw C call would silently read or
    # write past the end of the shorter array.
    n > length(dest) && throw(BoundsError(dest, n))
    n > length(src) && throw(BoundsError(src, n))
    # Keep both arrays rooted while only their raw pointers are in use.
    GC.@preserve dest src begin
        ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
            pointer(dest), pointer(src), n * Base.aligned_sizeof(T))
    end
    return dest
end

"""
    memcpy!(dest::Array{T}, src::Array{T}, n = length(src)) where T

Copy the first `n` elements of `src` into the start of `dest` with C `memcpy`
and return `dest`.

The number of bytes copied is `n * Base.aligned_sizeof(T)`. C leaves `memcpy`
undefined when the two regions overlap; no overlap check is made here, so the
caller must make sure `dest` and `src` do not alias.

Throws a `BoundsError` when `n` is larger than `length(dest)` or `length(src)`;
the underlying C call performs no bounds checking of its own.
"""
function memcpy!(dest::Array{T}, src::Array{T}, n = length(src)) where T
    # Reject oversized copies up front: the raw C call would silently read or
    # write past the end of the shorter array.
    n > length(dest) && throw(BoundsError(dest, n))
    n > length(src) && throw(BoundsError(src, n))
    # Keep both arrays rooted while only their raw pointers are in use.
    GC.@preserve dest src begin
        ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
            pointer(dest), pointer(src), n * Base.aligned_sizeof(T))
    end
    return dest
end
# Time both copy routines for buffer sizes 8, 16, ..., 1024 * 1024 bytes,
# doubling the size each round.
# `@btime` is provided by the BenchmarkTools package, which must be loaded first.
len = 8
while len <= 1024 * 1024
    # Uninitialised buffers are enough: only the copy time is measured.
    a = Array{UInt8}(undef, len)
    b = similar(a)
    println(len,"bytes")
    @btime memcpy!($b, $a)
    @btime memmove!($b, $a)
    # `global` makes the update reach the top-level `len` when this runs from a
    # file; a bare `len *= 2` only updates the global in the interactive REPL.
    global len *= 2
end

and its results (size, memcpy time, memmove time):

8bytes     3.200 ns (0 allocations: 0 bytes)  6.200 ns (0 allocations: 0 bytes)
16bytes    3.200 ns (0 allocations: 0 bytes)  6.100 ns (0 allocations: 0 bytes)
32bytes    3.200 ns (0 allocations: 0 bytes)  6.100 ns (0 allocations: 0 bytes)
64bytes    4.800 ns (0 allocations: 0 bytes)  7.300 ns (0 allocations: 0 bytes)
128bytes   5.900 ns (0 allocations: 0 bytes)  8.700 ns (0 allocations: 0 bytes)
256bytes   13.900 ns (0 allocations: 0 bytes)  11.612 ns (0 allocations: 0 bytes)
512bytes   15.716 ns (0 allocations: 0 bytes)  16.132 ns (0 allocations: 0 bytes)
1024bytes  19.100 ns (0 allocations: 0 bytes)  22.267 ns (0 allocations: 0 bytes)
2048bytes  27.008 ns (0 allocations: 0 bytes)  36.556 ns (0 allocations: 0 bytes)
4096bytes  42.944 ns (0 allocations: 0 bytes)  65.682 ns (0 allocations: 0 bytes)
8192bytes  72.290 ns (0 allocations: 0 bytes)  124.127 ns (0 allocations: 0 bytes)
16384bytes  135.402 ns (0 allocations: 0 bytes)  300.000 ns (0 allocations: 0 bytes)
32768bytes  524.084 ns (0 allocations: 0 bytes)  723.529 ns (0 allocations: 0 bytes)
65536bytes  1.030 μs (0 allocations: 0 bytes)  1.470 μs (0 allocations: 0 bytes)
131072bytes  2.333 μs (0 allocations: 0 bytes)  3.163 μs (0 allocations: 0 bytes)
262144bytes  6.080 μs (0 allocations: 0 bytes)  8.000 μs (0 allocations: 0 bytes)
524288bytes  13.000 μs (0 allocations: 0 bytes)  17.700 μs (0 allocations: 0 bytes)
1048576bytes  27.100 μs (0 allocations: 0 bytes)  35.600 μs (0 allocations: 0 bytes)

I think this result might depend on the operating system. If this speed-up holds in most cases, maybe we could make a PR to call “memcpy” when the two arrays do not overlap?

4 Likes

You could open an issue on the JuliaLang GitHub pointing at this. At first glance it seems useful, but I don’t know whether there are any potential problems that could arise.

There’s an old issue where memcpy was replaced with memmove in cases where aliasing is indeterminate. The current version of unsafe_copy! called by copyto! seems to skip aliasing checks entirely - an aliasing check could presumably be added to the isbitstype(T) branch to select between memcpy and memmove if it doesn’t break anything.

For fun…

julia> function loopcopy!(y,x)
           @inbounds for i ∈ eachindex(y,x)
               y[i] = x[i]
           end
       end
loopcopy! (generic function with 1 method)

julia> memmove!(dest::Array{T}, src::Array{T}, n = length(src)) where T = begin
           t1 = Base.@_gc_preserve_begin dest
           t2 = Base.@_gc_preserve_begin src
           destp = pointer(dest)
           srcp = pointer(src)
           ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
               destp, srcp, n * Base.aligned_sizeof(T))
           Base.@_gc_preserve_end t2
           Base.@_gc_preserve_end t1
           return dest
       end
memmove! (generic function with 2 methods)

julia> memcpy!(dest::Array{T}, src::Array{T}, n = length(src)) where T = begin
           t1 = Base.@_gc_preserve_begin dest
           t2 = Base.@_gc_preserve_begin src
           destp = pointer(dest)
           srcp = pointer(src)
           ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
               destp, srcp, n * Base.aligned_sizeof(T))
           Base.@_gc_preserve_end t2
           Base.@_gc_preserve_end t1
           return dest
       end
memcpy! (generic function with 2 methods)

julia> len = 8
8

julia> while len <= 1024 * 1024
           a = Array{UInt8}(undef,len); b = similar(a);
           println(len,"bytes")
           @btime memcpy!($b, $a)
           @btime memmove!($b, $a)
           @btime loopcopy!($b, $a)
           len *= 2
       end
8bytes
  3.617 ns (0 allocations: 0 bytes)
  4.775 ns (0 allocations: 0 bytes)
  6.705 ns (0 allocations: 0 bytes)
16bytes
  2.979 ns (0 allocations: 0 bytes)
  4.662 ns (0 allocations: 0 bytes)
  9.001 ns (0 allocations: 0 bytes)
32bytes
  3.101 ns (0 allocations: 0 bytes)
  4.780 ns (0 allocations: 0 bytes)
  13.222 ns (0 allocations: 0 bytes)
64bytes
  3.108 ns (0 allocations: 0 bytes)
  4.946 ns (0 allocations: 0 bytes)
  22.894 ns (0 allocations: 0 bytes)
128bytes
  4.897 ns (0 allocations: 0 bytes)
  7.253 ns (0 allocations: 0 bytes)
  44.554 ns (0 allocations: 0 bytes)
256bytes
  4.900 ns (0 allocations: 0 bytes)
  7.217 ns (0 allocations: 0 bytes)
  6.434 ns (0 allocations: 0 bytes)
512bytes
  9.818 ns (0 allocations: 0 bytes)
  10.856 ns (0 allocations: 0 bytes)
  8.692 ns (0 allocations: 0 bytes)
1024bytes
  14.015 ns (0 allocations: 0 bytes)
  14.665 ns (0 allocations: 0 bytes)
  11.139 ns (0 allocations: 0 bytes)
2048bytes
  25.384 ns (0 allocations: 0 bytes)
  26.810 ns (0 allocations: 0 bytes)
  17.778 ns (0 allocations: 0 bytes)
4096bytes
  40.784 ns (0 allocations: 0 bytes)
  43.654 ns (0 allocations: 0 bytes)
  26.727 ns (0 allocations: 0 bytes)
8192bytes
  51.374 ns (0 allocations: 0 bytes)
  52.428 ns (0 allocations: 0 bytes)
  44.641 ns (0 allocations: 0 bytes)
16384bytes
  114.295 ns (0 allocations: 0 bytes)
  117.284 ns (0 allocations: 0 bytes)
  144.331 ns (0 allocations: 0 bytes)
32768bytes
  597.660 ns (0 allocations: 0 bytes)
  610.381 ns (0 allocations: 0 bytes)
  639.365 ns (0 allocations: 0 bytes)
65536bytes
  920.333 ns (0 allocations: 0 bytes)
  975.286 ns (0 allocations: 0 bytes)
  908.938 ns (0 allocations: 0 bytes)
131072bytes
  3.623 μs (0 allocations: 0 bytes)
  3.382 μs (0 allocations: 0 bytes)
  2.484 μs (0 allocations: 0 bytes)
262144bytes
  6.359 μs (0 allocations: 0 bytes)
  6.073 μs (0 allocations: 0 bytes)
  3.682 μs (0 allocations: 0 bytes)
524288bytes
  15.064 μs (0 allocations: 0 bytes)
  14.255 μs (0 allocations: 0 bytes)
  7.302 μs (0 allocations: 0 bytes)
1048576bytes
  42.631 μs (0 allocations: 0 bytes)
  42.636 μs (0 allocations: 0 bytes)
  23.643 μs (0 allocations: 0 bytes)

LLVM of course handles aliasing correctly (so long as we don’t add @simd ivdep).
Interesting that memmove and memcpy are about 2x slower.

1 Like

@Elrod
I tried to look into the native code, and found this:

# Thin wrapper over C `memmove`: move `n` bytes from `src` to `dest` and return
# the pointer handed back by the C call.
function memmove!(dest::Ptr{T}, src::Ptr{T}, n::UInt) where T
    return ccall(:memmove, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), dest, src, n)
end
# Thin wrapper over C `memcpy`: copy `n` bytes from `src` to `dest` and return
# the pointer handed back by the C call.
function memcpy!(dest::Ptr{T}, src::Ptr{T}, n::UInt) where T
    return ccall(:memcpy, Ptr{Cvoid}, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), dest, src, n)
end
# Placeholder pointers: `@code_native` only needs the argument types to pick the
# method whose machine code is printed.
a = Ptr{Int}(1)
b = Ptr{Int}(2)
# Print a banner followed by the native code of each wrapper in turn.
for (banner, copyfn) in (
    ("--------------memmove!-------------------", memmove!),
    ("--------------memcpy!-------------------", memcpy!),
)
    println(banner)
    @code_native copyfn(b, a, UInt(1000))
end

which prints:

--------------memmove!-------------------
        .text
; ┌ @ REPL[1259]:3 within `memmove!`
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $32, %rsp
        movabsq $memcpy, %rax
        callq   *%rax
        addq    $32, %rsp
        popq    %rbp
        retq
        nopw    (%rax,%rax)
; └
--------------memcpy!-------------------
        .text
; ┌ @ REPL[1259]:6 within `memcpy!`
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %rsi
        subq    $40, %rsp
        movq    %rcx, %rsi
        movabsq $memmove, %rax
        movq    %rsi, %rcx
        callq   *%rax
        movq    %rsi, %rax
        addq    $40, %rsp
        popq    %rsi
        popq    %rbp
        retq
        nopw    %cs:(%rax,%rax)
; └

It seems memcpy! calls memmove, while memmove! calls memcpy. Is this a bug, or just a mislabelling in the printed output?

1 Like

Life is always full of coincidence. :joy: :rofl:

Should be "fix"ed after (#43580)

1 Like