From c1a2745bfbdf54906ede3411c02b6f320107d052 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Tue, 16 Mar 2021 00:04:03 +0300
Subject: [PATCH] Add one more variant to memcpy benchmark

---
 base/glibc-compatibility/memcpy/memcpy.h |   2 +-
 utils/memcpy-bench/memcpy-bench.cpp      | 172 ++++++++++++++++++++++-
 2 files changed, 172 insertions(+), 2 deletions(-)

diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h
index f9f81bcb0fe..211d144cecb 100644
--- a/base/glibc-compatibility/memcpy/memcpy.h
+++ b/base/glibc-compatibility/memcpy/memcpy.h
@@ -178,7 +178,7 @@ tail:
         size -= padding;
     }
 
-    /// Aligned unrolled copy. We will use all available SSE registers.
+    /// Aligned unrolled copy. We will use half of available SSE registers.
     /// It's not possible to have both src and dst aligned.
     /// So, we will use aligned stores and unaligned loads.
     __m128i c0, c1, c2, c3, c4, c5, c6, c7;
diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp
index cd769640017..5c664a76fe2 100644
--- a/utils/memcpy-bench/memcpy-bench.cpp
+++ b/utils/memcpy-bench/memcpy-bench.cpp
@@ -35,7 +35,7 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d
         size -= bytes_to_copy;
 
         /// Execute at least one SSE instruction as a penalty after running AVX code.
-        __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
+        __asm__ __volatile__ ("pxor %%xmm15, %%xmm15" ::: "xmm15");
     }
 }
 
@@ -385,6 +385,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
 
 bool have_avx = true;
 
+
 static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
 {
     uint8_t * ret = dst;
@@ -560,6 +561,174 @@ tail:
     return ret;
 }
 
+
+static uint8_t * memcpy_my2(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
+{
+    uint8_t * ret = dst;
+
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            *dst = *src;
+        }
+    }
+    else
+    {
+        if (size <= 128)
+        {
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else if (size < 30000 || !have_avx)
+        {
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
+
+            if (padding > 0)
+            {
+                __asm__(
+                    "vmovups (%[s]), %%ymm0\n"
+                    "vmovups %%ymm0, (%[d])\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "memory");
+
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            while (size >= 256)
+            {
+                __asm__(
+                    "vmovups (%[s]), %%ymm0\n"
+                    "vmovups 0x20(%[s]), %%ymm1\n"
+                    "vmovups 0x40(%[s]), %%ymm2\n"
+                    "vmovups 0x60(%[s]), %%ymm3\n"
+                    "vmovups 0x80(%[s]), %%ymm4\n"
+                    "vmovups 0xa0(%[s]), %%ymm5\n"
+                    "vmovups 0xc0(%[s]), %%ymm6\n"
+                    "vmovups 0xe0(%[s]), %%ymm7\n"
+                    "add $0x100,%[s]\n"
+                    "vmovaps %%ymm0, (%[d])\n"
+                    "vmovaps %%ymm1, 0x20(%[d])\n"
+                    "vmovaps %%ymm2, 0x40(%[d])\n"
+                    "vmovaps %%ymm3, 0x60(%[d])\n"
+                    "vmovaps %%ymm4, 0x80(%[d])\n"
+                    "vmovaps %%ymm5, 0xa0(%[d])\n"
+                    "vmovaps %%ymm6, 0xc0(%[d])\n"
+                    "vmovaps %%ymm7, 0xe0(%[d])\n"
+                    "add $0x100, %[d]\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory");
+
+                size -= 256;
+            }
+
+            __asm__(
+                "vmovups -0x20(%[s],%[size],1), %%ymm0\n"
+                "vmovups %%ymm0, -0x20(%[d],%[size],1)\n"
+                : [d]"+r"(dst), [s]"+r"(src)
+                : [size]"r"(size)
+                : "ymm0", "memory");
+
+            while (size > 32)
+            {
+                __asm__(
+                    "vmovups (%[s]), %%ymm0\n"
+                    "vmovups %%ymm0, (%[d])\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "memory");
+
+                dst += 32;
+                src += 32;
+                size -= 32;
+            }
+
+            __asm__ __volatile__ ("vzeroupper"
+                                  ::: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
+                                      "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15");
+        }
+    }
+
+    return ret;
+}
+
 extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
 extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
 extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
@@ -592,6 +761,7 @@ uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t *
     VARIANT(10, memcpy_fast_sse)
     VARIANT(11, memcpy_fast_avx)
    VARIANT(12, memcpy_my)
+    VARIANT(13, memcpy_my2)
 
     VARIANT(21, __memcpy_erms)
     VARIANT(22, __memcpy_sse2_unaligned)
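
Note (not part of the commit): a minimal standalone sketch of the overlapping-copy trick that memcpy_my2 uses for the 17..128-byte range. The last 16 bytes are stored first, then 16-byte chunks are copied forward; the final forward chunk may overlap bytes already written by the tail store, which is harmless because source and destination do not overlap. The function name copy_up_to_128 and the small main() driver are hypothetical, added only for illustration; the sketch assumes an x86-64 compiler with SSE2 (GCC or Clang).

#include <cstdint>
#include <cstring>
#include <cstdio>
#include <emmintrin.h>

/// Copy 'size' bytes where 16 < size <= 128, mirroring the corresponding branch of memcpy_my2.
static void copy_up_to_128(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
{
    /// Cover the tail first: an unaligned 16-byte store of the last 16 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16),
                     _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

    /// Then stream forward in 16-byte chunks; the last chunk may overlap the tail store.
    while (size > 16)
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
                         _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
        dst += 16;
        src += 16;
        size -= 16;
    }
}

int main()
{
    uint8_t src[128];
    uint8_t dst[128] = {};
    for (size_t i = 0; i < sizeof(src); ++i)
        src[i] = static_cast<uint8_t>(i);

    copy_up_to_128(dst, src, 100);
    printf("%s\n", memcmp(dst, src, 100) == 0 ? "ok" : "mismatch");
    return 0;
}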