Add one more variant to memcpy benchmark

This commit is contained in:
Alexey Milovidov 2021-03-16 00:04:03 +03:00
parent 1b2ed51ff5
commit c1a2745bfb
2 changed files with 172 additions and 2 deletions

View File

@ -178,7 +178,7 @@ tail:
size -= padding;
}
/// Aligned unrolled copy. We will use all available SSE registers.
/// Aligned unrolled copy. We will use half of available SSE registers.
/// It's not possible to have both src and dst aligned.
/// So, we will use aligned stores and unaligned loads.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;

View File

@ -35,7 +35,7 @@ void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_d
size -= bytes_to_copy;
/// Execute at least one SSE instruction as a penalty after running AVX code.
__asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
__asm__ __volatile__ ("pxor %%xmm15, %%xmm15" ::: "xmm15");
}
}
@ -385,6 +385,7 @@ void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t *
bool have_avx = true;
static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
{
uint8_t * ret = dst;
@ -560,6 +561,174 @@ tail:
return ret;
}
/** Copy `size` bytes from `src` to `dst` and return the original `dst`.
  * The regions must not overlap (both pointers are __restrict).
  *
  * Strategy, chosen by size:
  *  - size <= 16: two fixed-size moves (8/4/2 bytes) that cover the head and the tail
  *    of the range and may overlap each other; a single byte is copied directly.
  *  - 16 < size <= 128: unaligned 16-byte SSE copies; the last 16 bytes are copied first,
  *    so the forward loop does not need to handle a partial final chunk.
  *  - 128 < size < 30000, or AVX is disabled via `have_avx`: align `dst` to 16 bytes,
  *    then copy 128 bytes per iteration with unaligned loads and aligned stores.
  *  - otherwise: align `dst` to 32 bytes and copy 256 bytes per iteration with AVX
  *    instructions issued through inline asm, finishing with `vzeroupper`.
  *
  * Every asm statement that performs a copy is marked volatile. A non-volatile asm
  * whose declared outputs are unused may be discarded or moved by the optimizer, and
  * after the last tail iteration `dst`/`src` are dead, so the stores must not depend
  * on those outputs being considered live.
  */
static uint8_t * memcpy_my2(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
{
    uint8_t * ret = dst;

    if (size <= 16)
    {
        /// Head and tail moves overlap when size is not exactly the chunk size; that is intended.
        if (size >= 8)
        {
            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
            __builtin_memcpy(dst, src, 8);
        }
        else if (size >= 4)
        {
            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
            __builtin_memcpy(dst, src, 4);
        }
        else if (size >= 2)
        {
            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
            __builtin_memcpy(dst, src, 2);
        }
        else if (size >= 1)
        {
            *dst = *src;
        }
    }
    else
    {
        if (size <= 128)
        {
            /// Copy the last 16 bytes first, then walk forward in 16-byte steps.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

            while (size > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
                dst += 16;
                src += 16;
                size -= 16;
            }
        }
        else if (size < 30000 || !have_avx)
        {
            /// Align destination to 16 bytes boundary.
            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;

            if (padding > 0)
            {
                /// A full 16-byte unaligned copy covers the unaligned prefix;
                /// the bytes past `padding` are simply written again below.
                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
                dst += padding;
                src += padding;
                size -= padding;
            }

            /// Aligned unrolled copy: unaligned loads, aligned stores.
            __m128i c0, c1, c2, c3, c4, c5, c6, c7;

            while (size >= 128)
            {
                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
                src += 128;
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
                dst += 128;
                size -= 128;
            }

            /// Tail: last 16 bytes first. If fewer than 16 bytes remain, this reaches back
            /// into the part that was already copied, which is harmless.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

            while (size > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
                dst += 16;
                src += 16;
                size -= 16;
            }
        }
        else
        {
            /// Align destination to 32 bytes boundary.
            size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;

            if (padding > 0)
            {
                /// The pointers are only read here, so they are inputs, not outputs.
                __asm__ __volatile__(
                    "vmovups    (%[s]), %%ymm0\n"
                    "vmovups    %%ymm0, (%[d])\n"
                    :
                    : [d]"r"(dst), [s]"r"(src)
                    : "ymm0", "memory");

                dst += padding;
                src += padding;
                size -= padding;
            }

            while (size >= 256)
            {
                /// The asm itself advances both pointers by 256, hence the "+r" constraints.
                __asm__ __volatile__(
                    "vmovups    (%[s]), %%ymm0\n"
                    "vmovups    0x20(%[s]), %%ymm1\n"
                    "vmovups    0x40(%[s]), %%ymm2\n"
                    "vmovups    0x60(%[s]), %%ymm3\n"
                    "vmovups    0x80(%[s]), %%ymm4\n"
                    "vmovups    0xa0(%[s]), %%ymm5\n"
                    "vmovups    0xc0(%[s]), %%ymm6\n"
                    "vmovups    0xe0(%[s]), %%ymm7\n"
                    "add        $0x100,%[s]\n"
                    "vmovaps    %%ymm0, (%[d])\n"
                    "vmovaps    %%ymm1, 0x20(%[d])\n"
                    "vmovaps    %%ymm2, 0x40(%[d])\n"
                    "vmovaps    %%ymm3, 0x60(%[d])\n"
                    "vmovaps    %%ymm4, 0x80(%[d])\n"
                    "vmovaps    %%ymm5, 0xa0(%[d])\n"
                    "vmovaps    %%ymm6, 0xc0(%[d])\n"
                    "vmovaps    %%ymm7, 0xe0(%[d])\n"
                    "add        $0x100, %[d]\n"
                    : [d]"+r"(dst), [s]"+r"(src)
                    :
                    : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory");

                size -= 256;
            }

            /// Tail: copy the last 32 bytes first (addressed as base + size - 32).
            __asm__ __volatile__(
                "vmovups    -0x20(%[s],%[size],1), %%ymm0\n"
                "vmovups    %%ymm0, -0x20(%[d],%[size],1)\n"
                :
                : [d]"r"(dst), [s]"r"(src), [size]"r"(size)
                : "ymm0", "memory");

            while (size > 32)
            {
                __asm__ __volatile__(
                    "vmovups    (%[s]), %%ymm0\n"
                    "vmovups    %%ymm0, (%[d])\n"
                    :
                    : [d]"r"(dst), [s]"r"(src)
                    : "ymm0", "memory");

                dst += 32;
                src += 32;
                size -= 32;
            }

            /// Clear the upper halves of the YMM registers before returning to non-AVX code.
            __asm__ __volatile__ ("vzeroupper"
                ::: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
                    "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15");
        }
    }

    return ret;
}
/// Declarations of memcpy implementations with C linkage and the standard memcpy
/// signature (destination, source, byte count; returns void *). Only the declarations
/// are visible here; the definitions are provided elsewhere.
/// NOTE(review): the names look like glibc's internal per-ISA memcpy variants
/// (ERMS, SSE2 unaligned, SSSE3) — confirm where they are actually defined and linked from.
extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size);
extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size);
@ -592,6 +761,7 @@ uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t *
VARIANT(10, memcpy_fast_sse)
VARIANT(11, memcpy_fast_avx)
VARIANT(12, memcpy_my)
VARIANT(13, memcpy_my2)
VARIANT(21, __memcpy_erms)
VARIANT(22, __memcpy_sse2_unaligned)