#pragma clang diagnostic ignored "-Wreserved-identifier" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include template void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) { while (size) { size_t bytes_to_copy = std::min(size, chunk_size_distribution()); impl(dst, src, bytes_to_copy); dst += bytes_to_copy; src += bytes_to_copy; size -= bytes_to_copy; /// Execute at least one SSE instruction as a penalty after running AVX code. __asm__ __volatile__ ("pxor %%xmm15, %%xmm15" ::: "xmm15"); } } using RNG = pcg32_fast; template size_t generatorUniform(RNG & rng) { return rng() % N; }; template uint64_t test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl, const char * name) { Stopwatch watch; std::vector threads; threads.reserve(num_threads); for (size_t thread_num = 0; thread_num < num_threads; ++thread_num) { size_t begin = size * thread_num / num_threads; size_t end = size * (thread_num + 1) / num_threads; threads.emplace_back([begin, end, iterations, &src, &dst, &generator, &impl] { for (size_t iteration = 0; iteration < iterations; ++iteration) { loop( iteration % 2 ? &src[begin] : &dst[begin], iteration % 2 ? &dst[begin] : &src[begin], end - begin, [rng = RNG(), &generator]() mutable { return generator(rng); }, std::forward(impl)); /// NOLINT(bugprone-use-after-move,hicpp-invalid-access-moved) } }); } for (auto & thread : threads) thread.join(); uint64_t elapsed_ns = watch.elapsed(); /// Validation for (size_t i = 0; i < size; ++i) if (dst[i] != uint8_t(i)) throw std::logic_error("Incorrect result"); std::cout << name; return elapsed_ns; } using memcpy_type = void * (*)(const void * __restrict, void * __restrict, size_t); static void * memcpy_erms(void * dst, const void * src, size_t size) { asm volatile ( "rep movsb" : "=D"(dst), "=S"(src), "=c"(size) : "0"(dst), "1"(src), "2"(size) : "memory"); return dst; } static void * memcpy_trivial(void * __restrict dst_, const void * __restrict src_, size_t size) { char * __restrict dst = reinterpret_cast(dst_); const char * __restrict src = reinterpret_cast(src_); void * ret = dst; while (size > 0) { *dst = *src; ++dst; ++src; --size; } return ret; } extern "C" void * memcpy_jart(void * dst, const void * src, size_t size); extern "C" void MemCpy(void * dst, const void * src, size_t size); void * memcpy_fast_sse(void * dst, const void * src, size_t size); void * memcpy_fast_avx(void * dst, const void * src, size_t size); void * memcpy_tiny(void * dst, const void * src, size_t size); static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size) { unsigned char *dst = reinterpret_cast(destination); const unsigned char *src = reinterpret_cast(source); size_t padding; // small memory copy if (size <= 16) return memcpy_tiny(dst, src, size); // align destination to 16 bytes boundary padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } // medium size copy __m128i c0; for (; size >= 16; size -= 16) { c0 = _mm_loadu_si128(reinterpret_cast(src)); src += 16; _mm_store_si128((reinterpret_cast<__m128i*>(dst)), c0); dst += 16; } memcpy_tiny(dst, src, size); return destination; } static void * memcpySSE2Unrolled2(void * __restrict destination, const void * __restrict source, size_t size) { unsigned char *dst = reinterpret_cast(destination); const unsigned char *src = reinterpret_cast(source); size_t padding; // small memory copy if (size <= 32) return memcpy_tiny(dst, src, size); // align destination to 16 bytes boundary padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } // medium size copy __m128i c0; __m128i c1; for (; size >= 32; size -= 32) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); src += 32; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); dst += 32; } memcpy_tiny(dst, src, size); return destination; } static void * memcpySSE2Unrolled4(void * __restrict destination, const void * __restrict source, size_t size) { unsigned char *dst = reinterpret_cast(destination); const unsigned char *src = reinterpret_cast(source); size_t padding; // small memory copy if (size <= 64) return memcpy_tiny(dst, src, size); // align destination to 16 bytes boundary padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } // medium size copy __m128i c0; __m128i c1; __m128i c2; __m128i c3; for (; size >= 64; size -= 64) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); src += 64; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); dst += 64; } memcpy_tiny(dst, src, size); return destination; } static void * memcpySSE2Unrolled8(void * __restrict destination, const void * __restrict source, size_t size) { unsigned char *dst = reinterpret_cast(destination); const unsigned char *src = reinterpret_cast(source); size_t padding; // small memory copy if (size <= 128) return memcpy_tiny(dst, src, size); // align destination to 16 bytes boundary padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } // medium size copy __m128i c0; __m128i c1; __m128i c2; __m128i c3; __m128i c4; __m128i c5; __m128i c6; __m128i c7; for (; size >= 128; size -= 128) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); src += 128; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); dst += 128; } memcpy_tiny(dst, src, size); return destination; } //static __attribute__((__always_inline__, __target__("sse2"))) __attribute__((__always_inline__)) inline void memcpy_my_medium_sse(uint8_t * __restrict & dst, const uint8_t * __restrict & src, size_t & size) { /// Align destination to 16 bytes boundary. size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } /// Aligned unrolled copy. __m128i c0; __m128i c1; __m128i c2; __m128i c3; __m128i c4; __m128i c5; __m128i c6; __m128i c7; while (size >= 128) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); src += 128; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); dst += 128; size -= 128; } } __attribute__((__target__("avx"))) void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * __restrict & __restrict src, size_t & __restrict size) { size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; if (padding > 0) { __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); dst += padding; src += padding; size -= padding; } __m256i c0; __m256i c1; __m256i c2; __m256i c3; __m256i c4; __m256i c5; __m256i c6; __m256i c7; while (size >= 256) { c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); src += 256; _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); dst += 256; size -= 256; } } bool have_avx = true; static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size) { uint8_t * ret = dst; tail: if (size <= 16) { if (size >= 8) { __builtin_memcpy(dst + size - 8, src + size - 8, 8); __builtin_memcpy(dst, src, 8); } else if (size >= 4) { __builtin_memcpy(dst + size - 4, src + size - 4, 4); __builtin_memcpy(dst, src, 4); } else if (size >= 2) { __builtin_memcpy(dst + size - 2, src + size - 2, 2); __builtin_memcpy(dst, src, 2); } else if (size >= 1) { *dst = *src; } } else if (have_avx) { if (size <= 32) { __builtin_memcpy(dst, src, 8); __builtin_memcpy(dst + 8, src + 8, 8); dst += 16; src += 16; size -= 16; goto tail; } if (size <= 256) { __asm__( "vmovups -0x20(%[s],%[size],1), %%ymm0\n" "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" : [d]"+r"(dst), [s]"+r"(src) : [size]"r"(size) : "ymm0", "memory"); while (size > 32) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups %%ymm0, (%[d])\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "memory"); dst += 32; src += 32; size -= 32; } } else { size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; if (padding > 0) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups %%ymm0, (%[d])\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "memory"); dst += padding; src += padding; size -= padding; } while (size >= 256) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups 0x20(%[s]), %%ymm1\n" "vmovups 0x40(%[s]), %%ymm2\n" "vmovups 0x60(%[s]), %%ymm3\n" "vmovups 0x80(%[s]), %%ymm4\n" "vmovups 0xa0(%[s]), %%ymm5\n" "vmovups 0xc0(%[s]), %%ymm6\n" "vmovups 0xe0(%[s]), %%ymm7\n" "add $0x100,%[s]\n" "vmovaps %%ymm0, (%[d])\n" "vmovaps %%ymm1, 0x20(%[d])\n" "vmovaps %%ymm2, 0x40(%[d])\n" "vmovaps %%ymm3, 0x60(%[d])\n" "vmovaps %%ymm4, 0x80(%[d])\n" "vmovaps %%ymm5, 0xa0(%[d])\n" "vmovaps %%ymm6, 0xc0(%[d])\n" "vmovaps %%ymm7, 0xe0(%[d])\n" "add $0x100, %[d]\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); size -= 256; } goto tail; } } else { if (size <= 128) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); while (size > 16) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); dst += 16; src += 16; size -= 16; } } else { /// Align destination to 16 bytes boundary. size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } /// Aligned unrolled copy. __m128i c0; __m128i c1; __m128i c2; __m128i c3; __m128i c4; __m128i c5; __m128i c6; __m128i c7; while (size >= 128) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); src += 128; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); dst += 128; size -= 128; } goto tail; } } return ret; } static uint8_t * memcpy_my2(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size) { uint8_t * ret = dst; if (size <= 16) { if (size >= 8) { __builtin_memcpy(dst + size - 8, src + size - 8, 8); __builtin_memcpy(dst, src, 8); } else if (size >= 4) { __builtin_memcpy(dst + size - 4, src + size - 4, 4); __builtin_memcpy(dst, src, 4); } else if (size >= 2) { __builtin_memcpy(dst + size - 2, src + size - 2, 2); __builtin_memcpy(dst, src, 2); } else if (size >= 1) { *dst = *src; } } else if (size <= 128) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); while (size > 16) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); dst += 16; src += 16; size -= 16; } } else if (size < 30000 || !have_avx) { /// Align destination to 16 bytes boundary. size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } /// Aligned unrolled copy. __m128i c0; __m128i c1; __m128i c2; __m128i c3; __m128i c4; __m128i c5; __m128i c6; __m128i c7; while (size >= 128) { c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); src += 128; _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); dst += 128; size -= 128; } _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); while (size > 16) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); dst += 16; src += 16; size -= 16; } } else { size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; if (padding > 0) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups %%ymm0, (%[d])\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "memory"); dst += padding; src += padding; size -= padding; } while (size >= 512) /// NOLINT { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups 0x20(%[s]), %%ymm1\n" "vmovups 0x40(%[s]), %%ymm2\n" "vmovups 0x60(%[s]), %%ymm3\n" "vmovups 0x80(%[s]), %%ymm4\n" "vmovups 0xa0(%[s]), %%ymm5\n" "vmovups 0xc0(%[s]), %%ymm6\n" "vmovups 0xe0(%[s]), %%ymm7\n" "vmovups 0x100(%[s]), %%ymm8\n" "vmovups 0x120(%[s]), %%ymm9\n" "vmovups 0x140(%[s]), %%ymm10\n" "vmovups 0x160(%[s]), %%ymm11\n" "vmovups 0x180(%[s]), %%ymm12\n" "vmovups 0x1a0(%[s]), %%ymm13\n" "vmovups 0x1c0(%[s]), %%ymm14\n" "vmovups 0x1e0(%[s]), %%ymm15\n" "add $0x200, %[s]\n" "sub $0x200, %[size]\n" "vmovaps %%ymm0, (%[d])\n" "vmovaps %%ymm1, 0x20(%[d])\n" "vmovaps %%ymm2, 0x40(%[d])\n" "vmovaps %%ymm3, 0x60(%[d])\n" "vmovaps %%ymm4, 0x80(%[d])\n" "vmovaps %%ymm5, 0xa0(%[d])\n" "vmovaps %%ymm6, 0xc0(%[d])\n" "vmovaps %%ymm7, 0xe0(%[d])\n" "vmovaps %%ymm8, 0x100(%[d])\n" "vmovaps %%ymm9, 0x120(%[d])\n" "vmovaps %%ymm10, 0x140(%[d])\n" "vmovaps %%ymm11, 0x160(%[d])\n" "vmovaps %%ymm12, 0x180(%[d])\n" "vmovaps %%ymm13, 0x1a0(%[d])\n" "vmovaps %%ymm14, 0x1c0(%[d])\n" "vmovaps %%ymm15, 0x1e0(%[d])\n" "add $0x200, %[d]\n" : [d]"+r"(dst), [s]"+r"(src), [size]"+r"(size) : : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory"); } /*while (size >= 256) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups 0x20(%[s]), %%ymm1\n" "vmovups 0x40(%[s]), %%ymm2\n" "vmovups 0x60(%[s]), %%ymm3\n" "vmovups 0x80(%[s]), %%ymm4\n" "vmovups 0xa0(%[s]), %%ymm5\n" "vmovups 0xc0(%[s]), %%ymm6\n" "vmovups 0xe0(%[s]), %%ymm7\n" "add $0x100,%[s]\n" "vmovaps %%ymm0, (%[d])\n" "vmovaps %%ymm1, 0x20(%[d])\n" "vmovaps %%ymm2, 0x40(%[d])\n" "vmovaps %%ymm3, 0x60(%[d])\n" "vmovaps %%ymm4, 0x80(%[d])\n" "vmovaps %%ymm5, 0xa0(%[d])\n" "vmovaps %%ymm6, 0xc0(%[d])\n" "vmovaps %%ymm7, 0xe0(%[d])\n" "add $0x100, %[d]\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); size -= 256; }*/ /*while (size > 128) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups 0x20(%[s]), %%ymm1\n" "vmovups 0x40(%[s]), %%ymm2\n" "vmovups 0x60(%[s]), %%ymm3\n" "add $0x80, %[s]\n" "sub $0x80, %[size]\n" "vmovaps %%ymm0, (%[d])\n" "vmovaps %%ymm1, 0x20(%[d])\n" "vmovaps %%ymm2, 0x40(%[d])\n" "vmovaps %%ymm3, 0x60(%[d])\n" "add $0x80, %[d]\n" : [d]"+r"(dst), [s]"+r"(src), [size]"+r"(size) : : "ymm0", "ymm1", "ymm2", "ymm3", "memory"); }*/ __asm__( "vmovups -0x20(%[s],%[size],1), %%ymm0\n" "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" : [d]"+r"(dst), [s]"+r"(src) : [size]"r"(size) : "ymm0", "memory"); while (size > 32) { __asm__( "vmovups (%[s]), %%ymm0\n" "vmovups %%ymm0, (%[d])\n" : [d]"+r"(dst), [s]"+r"(src) : : "ymm0", "memory"); dst += 32; src += 32; size -= 32; } __asm__ __volatile__ ("vzeroupper" ::: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"); } return ret; } extern "C" void * __memcpy_erms(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_sse2_unaligned(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_ssse3(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_ssse3_back(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_avx_unaligned(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_avx_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_avx512_unaligned(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_avx512_unaligned_erms(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT extern "C" void * __memcpy_avx512_no_vzeroupper(void * __restrict destination, const void * __restrict source, size_t size); /// NOLINT #define VARIANT(N, NAME) \ if (memcpy_variant == (N)) \ return test(dst, src, size, iterations, num_threads, std::forward(generator), NAME, #NAME); template uint64_t dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) { memcpy_type memcpy_libc_old = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); VARIANT(1, memcpy) VARIANT(2, memcpy_trivial) VARIANT(3, memcpy_libc_old) VARIANT(4, memcpy_erms) VARIANT(5, MemCpy) VARIANT(6, memcpySSE2) VARIANT(7, memcpySSE2Unrolled2) VARIANT(8, memcpySSE2Unrolled4) VARIANT(9, memcpySSE2Unrolled8) VARIANT(10, memcpy_fast_sse) VARIANT(11, memcpy_fast_avx) VARIANT(12, memcpy_my) VARIANT(13, memcpy_my2) VARIANT(21, __memcpy_erms) VARIANT(22, __memcpy_sse2_unaligned) VARIANT(23, __memcpy_ssse3) VARIANT(24, __memcpy_ssse3_back) VARIANT(25, __memcpy_avx_unaligned) VARIANT(26, __memcpy_avx_unaligned_erms) VARIANT(27, __memcpy_avx512_unaligned) VARIANT(28, __memcpy_avx512_unaligned_erms) VARIANT(29, __memcpy_avx512_no_vzeroupper) return 0; } uint64_t dispatchVariants( size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) { if (generator_variant == 1) return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>); if (generator_variant == 2) return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); if (generator_variant == 3) return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); if (generator_variant == 4) return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); if (generator_variant == 5) return dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); return 0; } int main(int argc, char ** argv) { boost::program_options::options_description desc("Allowed options"); desc.add_options()("help,h", "produce help message") ("size", boost::program_options::value()->default_value(1000000), "Bytes to copy on every iteration") ("iterations", boost::program_options::value(), "Number of iterations") ("threads", boost::program_options::value()->default_value(1), "Number of copying threads") ("distribution", boost::program_options::value()->default_value(4), "Distribution of chunk sizes to perform copy") ("variant", boost::program_options::value(), "Variant of memcpy implementation") ("tsv", "Print result in tab-separated format") ; boost::program_options::variables_map options; boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); if (options.count("help") || !options.count("variant")) { std::cout << R"(Usage: for size in 4096 16384 50000 65536 100000 1000000 10000000 100000000; do for threads in 1 2 4 $(($(nproc) / 2)) $(nproc); do for distribution in 1 2 3 4 5; do for variant in {1..13} {21..29}; do for i in {1..10}; do ./memcpy-bench --tsv --size $size --variant $variant --threads $threads --distribution $distribution; done; done; done; done; done | tee result.tsv clickhouse-local --structure ' name String, size UInt64, iterations UInt64, threads UInt16, generator UInt8, memcpy UInt8, elapsed UInt64 ' --query " SELECT size, name, avg(1000 * elapsed / size / iterations) AS s, count() AS c FROM table GROUP BY size, name ORDER BY size ASC, s DESC " --output-format PrettyCompact < result.tsv )" << std::endl; std::cout << desc << std::endl; return 1; } size_t size = options["size"].as(); size_t num_threads = options["threads"].as(); size_t memcpy_variant = options["variant"].as(); size_t generator_variant = options["distribution"].as(); size_t iterations; if (options.count("iterations")) { iterations = options["iterations"].as(); } else { iterations = 10000000000ULL / size; if (generator_variant == 1) iterations /= 10; } std::unique_ptr src(new uint8_t[size]); std::unique_ptr dst(new uint8_t[size]); /// Fill src with some pattern for validation. for (size_t i = 0; i < size; ++i) src[i] = i; /// Fill dst to avoid page faults. memset(dst.get(), 0, size); uint64_t elapsed_ns = dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); std::cout << std::fixed << std::setprecision(3); if (options.count("tsv")) { std::cout << '\t' << size << '\t' << iterations << '\t' << num_threads << '\t' << generator_variant << '\t' << memcpy_variant << '\t' << elapsed_ns << '\n'; } else { std::cout << ": " << num_threads << " threads, " << "size: " << size << ", distribution " << generator_variant << ", processed in " << (elapsed_ns / 1e9) << " sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec\n"; } return 0; }