/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/

//	Copies memory.
//
//	DEST and SRC must not overlap, unless DEST≤SRC.
//
//	This entry point only records the destination pointer in rax.
//	It has no return instruction of its own: execution continues
//	through the alignment padding into the routine assembled right
//	after it (MemCpy), which performs the copy and returns.
//
//	@param	rdi is dest
//	@param	rsi is src
//	@param	rdx is number of bytes
//	@return	original rdi copied to rax
//	@mode	long
//	@asyncsignalsafe
	.globl	memcpy
	.type	memcpy,@function
memcpy:	movq	%rdi,%rax
//	𝑠𝑙𝑖𝑑𝑒
	.p2align 4
	.size	memcpy,.-memcpy

//	Copies memory w/ minimal impact ABI.
//
//	Dispatches through memcpytab, indexed by min(rdx, 32), to a copy
//	routine specialized for the size class. Every path leaves through
//	the single return at .L0, and every register outside the clobber
//	list below holds its entry value again on return.
//
//	@param	rdi is dest
//	@param	rsi is src
//	@param	rdx is number of bytes
//	@clob	flags,rcx,xmm3,xmm4
//	@mode	long
MemCpy:	mov	$.Lmemcpytab.size,%ecx
	cmp	%rcx,%rdx
	cmovb	%rdx,%rcx
//	rcx = min(rdx, .Lmemcpytab.size). The table is addressed with an
//	absolute displacement and no base register, so this jump is not
//	position independent.
	jmp	*memcpytab(,%rcx,8)
.Lanchorpoint:

//	32 bytes or more. Sizes of 1024 and up leave the plain SSE loop.
.L16r:	cmp	$1024,%rdx
	jae	.Lerms

//	16..31 bytes straight from the table, 32..1023 bytes via .L16r.
//	The last 16 source bytes are loaded into xmm4 before any store
//	and written last. The loop walks forward in 16-byte steps with
//	rcx running 32 ahead of the chunk offset, until rcx >= rdx.
.L16:	movdqu	-16(%rsi,%rdx),%xmm4
	mov	$16,%rcx
0:	add	$16,%rcx
	movdqu	-32(%rsi,%rcx),%xmm3
	movdqu	%xmm3,-32(%rdi,%rcx)
	cmp	%rcx,%rdx
	ja	0b
	movdqu	%xmm4,-16(%rdi,%rdx)
//	zero both vector scratch registers before returning
	pxor	%xmm4,%xmm4
	pxor	%xmm3,%xmm3
	jmp	.L0

//	8..15 bytes: one quadword from the start and one from the end,
//	which may overlap. Both loads happen before either store. rbx is
//	the second scratch register and is saved and restored around use.
.L8:	push	%rbx
	mov	(%rsi),%rcx
	mov	-8(%rsi,%rdx),%rbx
	mov	%rcx,(%rdi)
	mov	%rbx,-8(%rdi,%rdx)
//	shared tail for every path that pushed rbx
1:	pop	%rbx
.L0:	ret

//	4..7 bytes: same head and tail scheme with doublewords.
.L4:	push	%rbx
	mov	(%rsi),%ecx
	mov	-4(%rsi,%rdx),%ebx
	mov	%ecx,(%rdi)
	mov	%ebx,-4(%rdi,%rdx)
	jmp	1b

//	3 bytes: two words that overlap in the middle byte.
.L3:	push	%rbx
	mov	(%rsi),%cx
	mov	-2(%rsi,%rdx),%bx
	mov	%cx,(%rdi)
	mov	%bx,-2(%rdi,%rdx)
	jmp	1b

//	2 bytes.
.L2:	mov	(%rsi),%cx
	mov	%cx,(%rdi)
	jmp	.L0

//	1 byte.
.L1:	mov	(%rsi),%cl
	mov	%cl,(%rdi)
	jmp	.L0

//	1024 bytes up to and including 1 MiB: rep movsb. The instruction
//	advances rdi and rsi, so both are saved and restored here.
//	NOTE(review): rep movsb copies forward only while the direction
//	flag is clear. Nothing in this routine clears it, so DF=0 on
//	entry is assumed. Confirm against callers.
.Lerms:	cmp	$1024*1024,%rdx
	ja	.Lnts
	push	%rdi
	push	%rsi
	mov	%rdx,%rcx
	rep movsb
	pop	%rsi
	pop	%rdi
	jmp	.L0

//	More than 1 MiB: non-temporal stores. The first 16 bytes are
//	copied unaligned, then dest is advanced by 1..16 bytes to the
//	next 16-byte boundary (src and the count are adjusted by the same
//	amount) so that movntdq always gets an aligned address. The last
//	16 bytes are copied unaligned after the fence.
//
//	The alignment step rewrites rdi, rsi and rdx. None of them is in
//	the clobber list, so all three are saved on entry to this path
//	and restored before returning.
.Lnts:	push	%rdi
	push	%rsi
	push	%rdx
	movdqu	(%rsi),%xmm3
	movdqu	%xmm3,(%rdi)
	lea	16(%rdi),%rcx
	and	$-16,%rcx
	sub	%rdi,%rcx
//	rcx = distance from dest to the next 16-byte boundary (1..16)
	add	%rcx,%rdi
	add	%rcx,%rsi
	sub	%rcx,%rdx
	mov	$16,%rcx
0:	add	$16,%rcx
	movdqu	-32(%rsi,%rcx),%xmm3
	movntdq	%xmm3,-32(%rdi,%rcx)
	cmp	%rcx,%rdx
	ja	0b
	sfence
	movdqu	-16(%rsi,%rdx),%xmm3
	movdqu	%xmm3,-16(%rdi,%rdx)
	pxor	%xmm3,%xmm3
	pop	%rdx
	pop	%rsi
	pop	%rdi
	jmp	.L0
	.type	MemCpy,@function
	.size	MemCpy,.-MemCpy
	.globl	MemCpy

//	Dispatch table for MemCpy. Slots 0..31 are selected by the exact
//	byte count. .Lmemcpytab.size is the number of those slots (32)
//	and therefore also the index of the one extra slot that follows,
//	which serves every larger count.
	.section .rodata
	.align	8
memcpytab:
	.quad	.L0
	.quad	.L1
	.quad	.L2
	.quad	.L3
	.rept	4
	.quad	.L4
	.endr
	.rept	8
	.quad	.L8
	.endr
	.rept	16
	.quad	.L16
	.endr
	.equ	.Lmemcpytab.size,(.-memcpytab)/8
//	SSE + ERMS + NTS
	.quad	.L16r
	.type	memcpytab,@object
	.previous