Merge pull request #21520 from ClickHouse/replace-memcpy

Evaluate yet another memcpy
2024-11-29 11:02:08 +00:00 · 2021-03-14 12:24:48 +03:00 · 2021-03-14 12:24:48 +03:00 · 2ea38ea01e
commit 2ea38ea01e
parent 084bd03672 1606c7e3f3
22 changed files with 2270 additions and 1668 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests"

 if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
    # Only for Linux, x86_64.
-    # Implies ${ENABLE_FASTMEMCPY}
    option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
 elseif(GLIBC_COMPATIBILITY)
    message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration")
@ -536,7 +535,7 @@ macro (add_executable target)
    # explicitly acquire and interpose malloc symbols by clickhouse_malloc
    # if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on than provide memcpy symbol explicitly to neutrialize thinlto's libcall generation.
    if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO)
-        _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:clickhouse_memcpy>)
+        _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:memcpy>)
    else ()
        _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc>)
    endif ()
--- a/base/common/CMakeLists.txt
+++ b/base/common/CMakeLists.txt
@ -74,7 +74,6 @@ target_link_libraries (common
        ${CITYHASH_LIBRARIES}
        boost::headers_only
        boost::system
-        FastMemcpy
        Poco::Net
        Poco::Net::SSL
        Poco::Util
--- a/base/common/tests/CMakeLists.txt
+++ b/base/common/tests/CMakeLists.txt
@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS})
 target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS})
-target_link_libraries (local_date_time_comparison PRIVATE common)
+target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (realloc-perf PRIVATE common)
 add_check(local_date_time_comparison)

--- a/base/glibc-compatibility/CMakeLists.txt
+++ b/base/glibc-compatibility/CMakeLists.txt
@ -1,5 +1,8 @@
 if (GLIBC_COMPATIBILITY)
-    set (ENABLE_FASTMEMCPY ON)
+    add_subdirectory(memcpy)
+    if(TARGET memcpy)
+        set(MEMCPY_LIBRARY memcpy)
+    endif()

    enable_language(ASM)
    include(CheckIncludeFile)
@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY)
        list(APPEND glibc_compatibility_sources musl/getentropy.c)
    endif()

-    if (NOT ARCH_ARM)
-        # clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951
-        add_library (clickhouse_memcpy OBJECT
-            ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c
-        )
-    endif()
-
    # Need to omit frame pointers to match the performance of glibc
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")

@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY)
        target_compile_options(glibc-compatibility PRIVATE -fPIC)
    endif ()

-    target_link_libraries(global-libs INTERFACE glibc-compatibility)
+    target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY})

    install(
-        TARGETS glibc-compatibility
+        TARGETS glibc-compatibility ${MEMCPY_LIBRARY}
        EXPORT global
        ARCHIVE DESTINATION lib
    )

    message (STATUS "Some symbols from glibc will be replaced for compatibility")
+
 elseif (YANDEX_OFFICIAL_BUILD)
    message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.")
 endif ()
--- a/base/glibc-compatibility/memcpy/CMakeLists.txt
+++ b/base/glibc-compatibility/memcpy/CMakeLists.txt
@ -0,0 +1,8 @@
+if (ARCH_AMD64)
+    add_library(memcpy STATIC memcpy.cpp)
+
+    # We allow to include memcpy.h from user code for better inlining.
+    target_include_directories(memcpy PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+    target_compile_options(memcpy PRIVATE -fno-builtin-memcpy)
+endif ()
--- a/base/glibc-compatibility/memcpy/memcpy.cpp
+++ b/base/glibc-compatibility/memcpy/memcpy.cpp
@ -0,0 +1,6 @@
+#include "memcpy.h"
+
+extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    return inline_memcpy(dst, src, size);
+}
--- a/base/glibc-compatibility/memcpy/memcpy.h
+++ b/base/glibc-compatibility/memcpy/memcpy.h
@ -0,0 +1,217 @@
+#include <cstddef>
+
+#include <emmintrin.h>
+
+
+/** Custom memcpy implementation for ClickHouse.
+  * It has the following benefits over using glibc's implementation:
+  * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability.
+  * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient.
+  * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
+  * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries.
+  *
+  * Writing our own memcpy is extremely difficult for the following reasons:
+  * 1. The optimal variant depends on the specific CPU model.
+  * 2. The optimal variant depends on the distribution of size arguments.
+  * 3. It depends on the number of threads copying data concurrently.
+  * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other.
+  * Due to vast range of scenarios it makes proper testing especially difficult.
+  * When writing our own memcpy there is a risk to overoptimize it
+  * on non-representative microbenchmarks while making real-world use cases actually worse.
+  *
+  * Most of the benchmarks for memcpy on the internet are wrong.
+  *
+  * Let's look at the details:
+  *
+  * For small size, the order of branches in code is important.
+  * There are variants with specific order of branches (like here or in glibc)
+  * or with jump table (in asm code see example from Cosmopolitan libc:
+  * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61)
+  * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/)
+  *
+  * It's also important how to copy uneven sizes.
+  * Almost every implementation, including this, is using two overlapping movs.
+  *
+  * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation,
+  * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion.
+  *
+  * For larger sizes it's important to choose the instructions used:
+  * - SSE or AVX or AVX-512;
+  * - rep movsb;
+  * Performance will depend on the size threshold, on the CPU model, on the "erms" flag
+  * ("Enhansed Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes)
+  * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
+  *
+  * Using AVX-512 can be bad due to throttling.
+  * Using AVX can be bad if most code is using SSE due to switching penalty
+  * (it also depends on the usage of "vzeroupper" instruction).
+  * But in some cases AVX gives a win.
+  *
+  * It also depends on how many times the loop will be unrolled.
+  * We are unrolling the loop 8 times (by the number of available registers), but it not always the best.
+  *
+  * It also depends on the usage of aligned or unaligned loads/stores.
+  * We are using unaligned loads and aligned stores.
+  *
+  * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD.
+  * Setting up correct offset for prefetching is non-obvious.
+  *
+  * Non-temporary (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache).
+  * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
+  * because L3 cache is shared (and L2 cache is partially shared).
+  *
+  * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios,
+  * so we don't pay attention to using non-temporary stores.
+  *
+  * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most benefitial,
+  * even comparing to non-temporary aligned unrolled stores even with the most wide registers.
+  *
+  * memcpy can be written in asm, C or C++. The latter can also use inline asm.
+  * The asm implementation can be better to make sure that compiler won't make the code worse,
+  * to ensure the order of branches, the code layout, the usage of all required registers.
+  * But if it is located in separate translation unit, inlining will not be possible
+  * (inline asm can be used to overcome this limitation).
+  * Sometimes C or C++ code can be further optimized by compiler.
+  * For example, clang is capable replacing SSE intrinsics to AVX code if -mavx is used.
+  *
+  * Please note that compiler can replace plain code to memcpy and vice versa.
+  * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy;
+  *   it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
+  *   This is often used to implement unaligned load/store without undefined behaviour in C++.
+  * - a loop with copying bytes can be recognized and replaced by a call to memcpy;
+  *   it is controlled by -ftree-loop-distribute-patterns.
+  * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you
+  *   inline code somewhat similar to a decent implementation of memcpy.
+  *
+  * This description is up to date as of Mar 2021.
+  *
+  * How to test the memcpy implementation for performance:
+  * 1. Test on real production workload.
+  * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios.
+  *
+  * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes.
+  * See https://habr.com/en/company/yandex/blog/457612/
+  */
+
+
+static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
+{
+    /// We will use pointer arithmetic, so char pointer will be used.
+    /// Note that __restrict makes sense (otherwise compiler will reload data from memory
+    /// instead of using the value of registers due to possible aliasing).
+    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
+    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
+
+    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
+    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
+    /// for inlining and removing this single instruction.
+    void * ret = dst;
+
+tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else
+    {
+        /// Medium and large sizes.
+        if (size <= 128)
+        {
+            /// Medium size, not enough for full loop unrolling.
+
+            /// We will copy the last 16 bytes.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            /// Large size with fully unrolled loop.
+
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            /// If not aligned - we will copy first 16 bytes with unaligned stores.
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy. We will use all available SSE registers.
+            /// It's not possible to have both src and dst aligned.
+            /// So, we will use aligned stores and unaligned loads.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            /// The latest remaining 0..127 bytes will be processed as usual.
+            goto tail;
+        }
+    }
+
+    return ret;
+}
+
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -38,7 +38,6 @@ add_subdirectory (boost-cmake)
 add_subdirectory (cctz-cmake)
 add_subdirectory (consistent-hashing)
 add_subdirectory (dragonbox-cmake)
-add_subdirectory (FastMemcpy)
 add_subdirectory (hyperscan-cmake)
 add_subdirectory (jemalloc-cmake)
 add_subdirectory (libcpuid-cmake)
--- a/contrib/FastMemcpy/CMakeLists.txt
+++ b/contrib/FastMemcpy/CMakeLists.txt
@ -1,28 +0,0 @@
-option (ENABLE_FASTMEMCPY "Enable FastMemcpy library (only internal)" ${ENABLE_LIBRARIES})
-
-if (NOT OS_LINUX OR ARCH_AARCH64)
-    set (ENABLE_FASTMEMCPY OFF)
-endif ()
-
-if (ENABLE_FASTMEMCPY)
-    set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy)
-
-    set (SRCS
-        ${LIBRARY_DIR}/FastMemcpy.c
-
-        memcpy_wrapper.c
-    )
-
-    add_library (FastMemcpy ${SRCS})
-    target_include_directories (FastMemcpy PUBLIC ${LIBRARY_DIR})
-
-    target_compile_definitions(FastMemcpy PUBLIC USE_FASTMEMCPY=1)
-
-    message (STATUS "Using FastMemcpy")
-else ()
-    add_library (FastMemcpy INTERFACE)
-
-    target_compile_definitions(FastMemcpy INTERFACE USE_FASTMEMCPY=0)
-
-    message (STATUS "Not using FastMemcpy")
-endif ()
--- a/contrib/FastMemcpy/FastMemcpy.c
+++ b/contrib/FastMemcpy/FastMemcpy.c
@ -1,220 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
-//
-//=====================================================================
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-
-#if (defined(_WIN32) || defined(WIN32))
-#include <windows.h>
-#include <mmsystem.h>
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-#elif defined(__unix)
-#include <sys/time.h>
-#include <unistd.h>
-#else
-#error it can only be compiled under windows or unix
-#endif
-
-#include "FastMemcpy.h"
-
-unsigned int gettime()
-{
-	#if (defined(_WIN32) || defined(WIN32))
-	return timeGetTime();
-	#else
-	static struct timezone tz={ 0,0 };
-	struct timeval time;
-	gettimeofday(&time,&tz);
-	return (time.tv_sec * 1000 + time.tv_usec / 1000);
-	#endif
-}
-
-void sleepms(unsigned int millisec)
-{
-#if defined(_WIN32) || defined(WIN32)
-	Sleep(millisec);
-#else
-	usleep(millisec * 1000);
-#endif
-}
-
-
-void benchmark(int dstalign, int srcalign, size_t size, int times)
-{
-	char *DATA1 = (char*)malloc(size + 64);
-	char *DATA2 = (char*)malloc(size + 64);
-	size_t LINEAR1 = ((size_t)DATA1);
-	size_t LINEAR2 = ((size_t)DATA2);
-	char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
-	char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
-	char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
-	char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
-	unsigned int t1, t2;
-	int k;
-	
-	sleepms(100);
-	t1 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy(dst, src, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy_fast(dst, src, size);
-	}
-	t2 = gettime() - t2;
-
-	free(DATA1);
-	free(DATA2);
-
-	printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",  
-		dstalign? "aligned" : "unalign", 
-		srcalign? "aligned" : "unalign", (int)t2, (int)t1);
-}
-
-
-void bench(int copysize, int times)
-{
-	printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
-	benchmark(1, 1, copysize, times);
-	benchmark(1, 0, copysize, times);
-	benchmark(0, 1, copysize, times);
-	benchmark(0, 0, copysize, times);
-	printf("\n");
-}
-
-
-void random_bench(int maxsize, int times)
-{
-	static char A[11 * 1024 * 1024 + 2];
-	static char B[11 * 1024 * 1024 + 2];
-	static int random_offsets[0x10000];
-	static int random_sizes[0x8000];
-	unsigned int i, p1, p2;
-	unsigned int t1, t2;
-	for (i = 0; i < 0x10000; i++) {	// generate random offsets
-		random_offsets[i] = rand() % (10 * 1024 * 1024 + 1);
-	}
-	for (i = 0; i < 0x8000; i++) {	// generate random sizes
-		random_sizes[i] = 1 + rand() % maxsize;
-	}
-	sleepms(100);
-	t1 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy(A + offset1, B + offset2, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy_fast(A + offset1, B + offset2, size);
-	}
-	t2 = gettime() - t2;
-	printf("benchmark random access:\n");
-	printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
-}
-
-
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-
-int main(void)
-{
-	bench(32, 0x1000000);
-	bench(64, 0x1000000);
-	bench(512, 0x800000);
-	bench(1024, 0x400000);
-	bench(4096, 0x80000);
-	bench(8192, 0x40000);
-	bench(1024 * 1024 * 1, 0x800);
-	bench(1024 * 1024 * 4, 0x200);
-	bench(1024 * 1024 * 8, 0x100);
-	
-	random_bench(2048, 8000000);
-
-	return 0;
-}
-
-
-
-
-/*
-benchmark(size=32 bytes, times=16777216):
-result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms
-result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms
-result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms
-result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms
-
-benchmark(size=64 bytes, times=16777216):
-result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms
-result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms
-result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms
-result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms
-
-benchmark(size=512 bytes, times=8388608):
-result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms
-result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms
-result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms
-result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms
-
-benchmark(size=1024 bytes, times=4194304):
-result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms
-result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms
-result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms
-result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms
-
-benchmark(size=4096 bytes, times=524288):
-result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
-result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms
-result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms
-result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms
-
-benchmark(size=8192 bytes, times=262144):
-result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
-result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms
-result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms
-result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms
-
-benchmark(size=1048576 bytes, times=2048):
-result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms
-result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms
-result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms
-result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms
-
-benchmark(size=4194304 bytes, times=512):
-result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms
-result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms
-result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms
-result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms
-
-benchmark(size=8388608 bytes, times=256):
-result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms
-result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms
-result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms
-result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms
-
-benchmark random access:
-memcpy_fast=515ms memcpy=1014ms
-
-*/
-
-
-
-
--- a/contrib/FastMemcpy/FastMemcpy.h
+++ b/contrib/FastMemcpy/FastMemcpy.h
@ -1,694 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
-//
-//=====================================================================
-#ifndef __FAST_MEMCPY_H__
-#define __FAST_MEMCPY_H__
-
-#include <stddef.h>
-#include <stdint.h>
-#include <emmintrin.h>
-
-
-//---------------------------------------------------------------------
-// force inline for compilers
-//---------------------------------------------------------------------
-#ifndef INLINE
-#ifdef __GNUC__
-#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
-    #define INLINE         __inline__ __attribute__((always_inline))
-#else
-    #define INLINE         __inline__
-#endif
-#elif defined(_MSC_VER)
-	#define INLINE __forceinline
-#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
-    #define INLINE __inline
-#else
-    #define INLINE
-#endif
-#endif
-
-typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
-typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
-typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
-
-//---------------------------------------------------------------------
-// fast copy for different sizes
-//---------------------------------------------------------------------
-static INLINE void memcpy_sse2_16(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-}
-
-static INLINE void memcpy_sse2_32(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-}
-
-static INLINE void memcpy_sse2_64(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-	__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-	_mm_storeu_si128(((__m128i*)dst) + 2, m2);
-	_mm_storeu_si128(((__m128i*)dst) + 3, m3);
-}
-
-static INLINE void memcpy_sse2_128(void *dst, const void *src) {
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-	__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-	__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-	__m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-	__m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-	__m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-	__m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-	_mm_storeu_si128(((__m128i*)dst) + 1, m1);
-	_mm_storeu_si128(((__m128i*)dst) + 2, m2);
-	_mm_storeu_si128(((__m128i*)dst) + 3, m3);
-	_mm_storeu_si128(((__m128i*)dst) + 4, m4);
-	_mm_storeu_si128(((__m128i*)dst) + 5, m5);
-	_mm_storeu_si128(((__m128i*)dst) + 6, m6);
-	_mm_storeu_si128(((__m128i*)dst) + 7, m7);
-}
-
-
-//---------------------------------------------------------------------
-// tiny memory copy with jump table optimized
-//---------------------------------------------------------------------
-/// Attribute is used to avoid an error with undefined behaviour sanitizer
-/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
-/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
-	unsigned char *dd = ((unsigned char*)dst) + size;
-	const unsigned char *ss = ((const unsigned char*)src) + size;
-
-	switch (size) {
-	case 64:
-		memcpy_sse2_64(dd - 64, ss - 64);
-	case 0:
-		break;
-
-	case 65:
-		memcpy_sse2_64(dd - 65, ss - 65);
-	case 1:
-		dd[-1] = ss[-1];
-		break;
-
-	case 66:
-		memcpy_sse2_64(dd - 66, ss - 66);
-	case 2:
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 67:
-		memcpy_sse2_64(dd - 67, ss - 67);
-	case 3:
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 68:
-		memcpy_sse2_64(dd - 68, ss - 68);
-	case 4:
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 69:
-		memcpy_sse2_64(dd - 69, ss - 69);
-	case 5:
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 70:
-		memcpy_sse2_64(dd - 70, ss - 70);
-	case 6:
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 71:
-		memcpy_sse2_64(dd - 71, ss - 71);
-	case 7:
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 72:
-		memcpy_sse2_64(dd - 72, ss - 72);
-	case 8:
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 73:
-		memcpy_sse2_64(dd - 73, ss - 73);
-	case 9:
-		*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
-		dd[-1] = ss[-1];
-		break;
-
-	case 74:
-		memcpy_sse2_64(dd - 74, ss - 74);
-	case 10:
-		*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 75:
-		memcpy_sse2_64(dd - 75, ss - 75);
-	case 11:
-		*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 76:
-		memcpy_sse2_64(dd - 76, ss - 76);
-	case 12:
-		*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 77:
-		memcpy_sse2_64(dd - 77, ss - 77);
-	case 13:
-		*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 78:
-		memcpy_sse2_64(dd - 78, ss - 78);
-	case 14:
-		*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 79:
-		memcpy_sse2_64(dd - 79, ss - 79);
-	case 15:
-		*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 80:
-		memcpy_sse2_64(dd - 80, ss - 80);
-	case 16:
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 81:
-		memcpy_sse2_64(dd - 81, ss - 81);
-	case 17:
-		memcpy_sse2_16(dd - 17, ss - 17);
-		dd[-1] = ss[-1];
-		break;
-
-	case 82:
-		memcpy_sse2_64(dd - 82, ss - 82);
-	case 18:
-		memcpy_sse2_16(dd - 18, ss - 18);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 83:
-		memcpy_sse2_64(dd - 83, ss - 83);
-	case 19:
-		memcpy_sse2_16(dd - 19, ss - 19);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 84:
-		memcpy_sse2_64(dd - 84, ss - 84);
-	case 20:
-		memcpy_sse2_16(dd - 20, ss - 20);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 85:
-		memcpy_sse2_64(dd - 85, ss - 85);
-	case 21:
-		memcpy_sse2_16(dd - 21, ss - 21);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 86:
-		memcpy_sse2_64(dd - 86, ss - 86);
-	case 22:
-		memcpy_sse2_16(dd - 22, ss - 22);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 87:
-		memcpy_sse2_64(dd - 87, ss - 87);
-	case 23:
-		memcpy_sse2_16(dd - 23, ss - 23);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 88:
-		memcpy_sse2_64(dd - 88, ss - 88);
-	case 24:
-		memcpy_sse2_16(dd - 24, ss - 24);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 89:
-		memcpy_sse2_64(dd - 89, ss - 89);
-	case 25:
-		memcpy_sse2_16(dd - 25, ss - 25);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 90:
-		memcpy_sse2_64(dd - 90, ss - 90);
-	case 26:
-		memcpy_sse2_16(dd - 26, ss - 26);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 91:
-		memcpy_sse2_64(dd - 91, ss - 91);
-	case 27:
-		memcpy_sse2_16(dd - 27, ss - 27);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 92:
-		memcpy_sse2_64(dd - 92, ss - 92);
-	case 28:
-		memcpy_sse2_16(dd - 28, ss - 28);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 93:
-		memcpy_sse2_64(dd - 93, ss - 93);
-	case 29:
-		memcpy_sse2_16(dd - 29, ss - 29);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 94:
-		memcpy_sse2_64(dd - 94, ss - 94);
-	case 30:
-		memcpy_sse2_16(dd - 30, ss - 30);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 95:
-		memcpy_sse2_64(dd - 95, ss - 95);
-	case 31:
-		memcpy_sse2_16(dd - 31, ss - 31);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 96:
-		memcpy_sse2_64(dd - 96, ss - 96);
-	case 32:
-		memcpy_sse2_32(dd - 32, ss - 32);
-		break;
-
-	case 97:
-		memcpy_sse2_64(dd - 97, ss - 97);
-	case 33:
-		memcpy_sse2_32(dd - 33, ss - 33);
-		dd[-1] = ss[-1];
-		break;
-
-	case 98:
-		memcpy_sse2_64(dd - 98, ss - 98);
-	case 34:
-		memcpy_sse2_32(dd - 34, ss - 34);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 99:
-		memcpy_sse2_64(dd - 99, ss - 99);
-	case 35:
-		memcpy_sse2_32(dd - 35, ss - 35);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 100:
-		memcpy_sse2_64(dd - 100, ss - 100);
-	case 36:
-		memcpy_sse2_32(dd - 36, ss - 36);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 101:
-		memcpy_sse2_64(dd - 101, ss - 101);
-	case 37:
-		memcpy_sse2_32(dd - 37, ss - 37);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 102:
-		memcpy_sse2_64(dd - 102, ss - 102);
-	case 38:
-		memcpy_sse2_32(dd - 38, ss - 38);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 103:
-		memcpy_sse2_64(dd - 103, ss - 103);
-	case 39:
-		memcpy_sse2_32(dd - 39, ss - 39);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 104:
-		memcpy_sse2_64(dd - 104, ss - 104);
-	case 40:
-		memcpy_sse2_32(dd - 40, ss - 40);
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 105:
-		memcpy_sse2_64(dd - 105, ss - 105);
-	case 41:
-		memcpy_sse2_32(dd - 41, ss - 41);
-		*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
-		dd[-1] = ss[-1];
-		break;
-
-	case 106:
-		memcpy_sse2_64(dd - 106, ss - 106);
-	case 42:
-		memcpy_sse2_32(dd - 42, ss - 42);
-		*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 107:
-		memcpy_sse2_64(dd - 107, ss - 107);
-	case 43:
-		memcpy_sse2_32(dd - 43, ss - 43);
-		*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 108:
-		memcpy_sse2_64(dd - 108, ss - 108);
-	case 44:
-		memcpy_sse2_32(dd - 44, ss - 44);
-		*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 109:
-		memcpy_sse2_64(dd - 109, ss - 109);
-	case 45:
-		memcpy_sse2_32(dd - 45, ss - 45);
-		*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 110:
-		memcpy_sse2_64(dd - 110, ss - 110);
-	case 46:
-		memcpy_sse2_32(dd - 46, ss - 46);
-		*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 111:
-		memcpy_sse2_64(dd - 111, ss - 111);
-	case 47:
-		memcpy_sse2_32(dd - 47, ss - 47);
-		*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
-		*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
-		break;
-
-	case 112:
-		memcpy_sse2_64(dd - 112, ss - 112);
-	case 48:
-		memcpy_sse2_32(dd - 48, ss - 48);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 113:
-		memcpy_sse2_64(dd - 113, ss - 113);
-	case 49:
-		memcpy_sse2_32(dd - 49, ss - 49);
-		memcpy_sse2_16(dd - 17, ss - 17);
-		dd[-1] = ss[-1];
-		break;
-
-	case 114:
-		memcpy_sse2_64(dd - 114, ss - 114);
-	case 50:
-		memcpy_sse2_32(dd - 50, ss - 50);
-		memcpy_sse2_16(dd - 18, ss - 18);
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 115:
-		memcpy_sse2_64(dd - 115, ss - 115);
-	case 51:
-		memcpy_sse2_32(dd - 51, ss - 51);
-		memcpy_sse2_16(dd - 19, ss - 19);
-		*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
-		dd[-1] = ss[-1];
-		break;
-
-	case 116:
-		memcpy_sse2_64(dd - 116, ss - 116);
-	case 52:
-		memcpy_sse2_32(dd - 52, ss - 52);
-		memcpy_sse2_16(dd - 20, ss - 20);
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 117:
-		memcpy_sse2_64(dd - 117, ss - 117);
-	case 53:
-		memcpy_sse2_32(dd - 53, ss - 53);
-		memcpy_sse2_16(dd - 21, ss - 21);
-		*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
-		dd[-1] = ss[-1];
-		break;
-
-	case 118:
-		memcpy_sse2_64(dd - 118, ss - 118);
-	case 54:
-		memcpy_sse2_32(dd - 54, ss - 54);
-		memcpy_sse2_16(dd - 22, ss - 22);
-		*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
-		*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
-		break;
-
-	case 119:
-		memcpy_sse2_64(dd - 119, ss - 119);
-	case 55:
-		memcpy_sse2_32(dd - 55, ss - 55);
-		memcpy_sse2_16(dd - 23, ss - 23);
-		*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
-		*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
-		break;
-
-	case 120:
-		memcpy_sse2_64(dd - 120, ss - 120);
-	case 56:
-		memcpy_sse2_32(dd - 56, ss - 56);
-		memcpy_sse2_16(dd - 24, ss - 24);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 121:
-		memcpy_sse2_64(dd - 121, ss - 121);
-	case 57:
-		memcpy_sse2_32(dd - 57, ss - 57);
-		memcpy_sse2_16(dd - 25, ss - 25);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 122:
-		memcpy_sse2_64(dd - 122, ss - 122);
-	case 58:
-		memcpy_sse2_32(dd - 58, ss - 58);
-		memcpy_sse2_16(dd - 26, ss - 26);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 123:
-		memcpy_sse2_64(dd - 123, ss - 123);
-	case 59:
-		memcpy_sse2_32(dd - 59, ss - 59);
-		memcpy_sse2_16(dd - 27, ss - 27);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 124:
-		memcpy_sse2_64(dd - 124, ss - 124);
-	case 60:
-		memcpy_sse2_32(dd - 60, ss - 60);
-		memcpy_sse2_16(dd - 28, ss - 28);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 125:
-		memcpy_sse2_64(dd - 125, ss - 125);
-	case 61:
-		memcpy_sse2_32(dd - 61, ss - 61);
-		memcpy_sse2_16(dd - 29, ss - 29);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 126:
-		memcpy_sse2_64(dd - 126, ss - 126);
-	case 62:
-		memcpy_sse2_32(dd - 62, ss - 62);
-		memcpy_sse2_16(dd - 30, ss - 30);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 127:
-		memcpy_sse2_64(dd - 127, ss - 127);
-	case 63:
-		memcpy_sse2_32(dd - 63, ss - 63);
-		memcpy_sse2_16(dd - 31, ss - 31);
-		memcpy_sse2_16(dd - 16, ss - 16);
-		break;
-
-	case 128:
-		memcpy_sse2_128(dd - 128, ss - 128);
-		break;
-	}
-
-	return dst;
-}
-
-
-//---------------------------------------------------------------------
-// main routine
-//---------------------------------------------------------------------
-static void* memcpy_fast(void *destination, const void *source, size_t size)
-{
-	unsigned char *dst = (unsigned char*)destination;
-	const unsigned char *src = (const unsigned char*)source;
-	static size_t cachesize = 0x200000; // L2-cache size
-	size_t padding;
-
-	// small memory copy
-	if (size <= 128) {
-		return memcpy_tiny(dst, src, size);
-	}
-
-	// align destination to 16 bytes boundary
-	padding = (16 - (((size_t)dst) & 15)) & 15;
-
-	if (padding > 0) {
-		__m128i head = _mm_loadu_si128((const __m128i*)src);
-		_mm_storeu_si128((__m128i*)dst, head);
-		dst += padding;
-		src += padding;
-		size -= padding;
-	}
-
-	// medium size copy
-	if (size <= cachesize) {
-		__m128i c0, c1, c2, c3, c4, c5, c6, c7;
-
-		for (; size >= 128; size -= 128) {
-			c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-			c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-			c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-			c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-			c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-			c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-			c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-			c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-			_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-			src += 128;
-			_mm_store_si128((((__m128i*)dst) + 0), c0);
-			_mm_store_si128((((__m128i*)dst) + 1), c1);
-			_mm_store_si128((((__m128i*)dst) + 2), c2);
-			_mm_store_si128((((__m128i*)dst) + 3), c3);
-			_mm_store_si128((((__m128i*)dst) + 4), c4);
-			_mm_store_si128((((__m128i*)dst) + 5), c5);
-			_mm_store_si128((((__m128i*)dst) + 6), c6);
-			_mm_store_si128((((__m128i*)dst) + 7), c7);
-			dst += 128;
-		}
-	}
-	else {		// big memory copy
-		__m128i c0, c1, c2, c3, c4, c5, c6, c7;
-
-		_mm_prefetch((const char*)(src), _MM_HINT_NTA);
-
-		if ((((size_t)src) & 15) == 0) {	// source aligned
-			for (; size >= 128; size -= 128) {
-				c0 = _mm_load_si128(((const __m128i*)src) + 0);
-				c1 = _mm_load_si128(((const __m128i*)src) + 1);
-				c2 = _mm_load_si128(((const __m128i*)src) + 2);
-				c3 = _mm_load_si128(((const __m128i*)src) + 3);
-				c4 = _mm_load_si128(((const __m128i*)src) + 4);
-				c5 = _mm_load_si128(((const __m128i*)src) + 5);
-				c6 = _mm_load_si128(((const __m128i*)src) + 6);
-				c7 = _mm_load_si128(((const __m128i*)src) + 7);
-				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-				src += 128;
-				_mm_stream_si128((((__m128i*)dst) + 0), c0);
-				_mm_stream_si128((((__m128i*)dst) + 1), c1);
-				_mm_stream_si128((((__m128i*)dst) + 2), c2);
-				_mm_stream_si128((((__m128i*)dst) + 3), c3);
-				_mm_stream_si128((((__m128i*)dst) + 4), c4);
-				_mm_stream_si128((((__m128i*)dst) + 5), c5);
-				_mm_stream_si128((((__m128i*)dst) + 6), c6);
-				_mm_stream_si128((((__m128i*)dst) + 7), c7);
-				dst += 128;
-			}
-		}
-		else {							// source unaligned
-			for (; size >= 128; size -= 128) {
-				c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-				c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
-				c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
-				c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
-				c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
-				c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
-				c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
-				c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
-				_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
-				src += 128;
-				_mm_stream_si128((((__m128i*)dst) + 0), c0);
-				_mm_stream_si128((((__m128i*)dst) + 1), c1);
-				_mm_stream_si128((((__m128i*)dst) + 2), c2);
-				_mm_stream_si128((((__m128i*)dst) + 3), c3);
-				_mm_stream_si128((((__m128i*)dst) + 4), c4);
-				_mm_stream_si128((((__m128i*)dst) + 5), c5);
-				_mm_stream_si128((((__m128i*)dst) + 6), c6);
-				_mm_stream_si128((((__m128i*)dst) + 7), c7);
-				dst += 128;
-			}
-		}
-		_mm_sfence();
-	}
-
-	memcpy_tiny(dst, src, size);
-
-	return destination;
-}
-
-
-#endif
--- a/contrib/FastMemcpy/FastMemcpy_Avx.c
+++ b/contrib/FastMemcpy/FastMemcpy_Avx.c
@ -1,171 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
-//
-//=====================================================================
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <assert.h>
-
-#if (defined(_WIN32) || defined(WIN32))
-#include <windows.h>
-#include <mmsystem.h>
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-#elif defined(__unix)
-#include <sys/time.h>
-#include <unistd.h>
-#else
-#error it can only be compiled under windows or unix
-#endif
-
-#include "FastMemcpy_Avx.h"
-
-
-unsigned int gettime()
-{
-	#if (defined(_WIN32) || defined(WIN32))
-	return timeGetTime();
-	#else
-	static struct timezone tz={ 0,0 };
-	struct timeval time;
-	gettimeofday(&time,&tz);
-	return (time.tv_sec * 1000 + time.tv_usec / 1000);
-	#endif
-}
-
-void sleepms(unsigned int millisec)
-{
-#if defined(_WIN32) || defined(WIN32)
-	Sleep(millisec);
-#else
-	usleep(millisec * 1000);
-#endif
-}
-
-
-
-void benchmark(int dstalign, int srcalign, size_t size, int times)
-{
-	char *DATA1 = (char*)malloc(size + 64);
-	char *DATA2 = (char*)malloc(size + 64);
-	size_t LINEAR1 = ((size_t)DATA1);
-	size_t LINEAR2 = ((size_t)DATA2);
-	char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
-	char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
-	char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
-	char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
-	unsigned int t1, t2;
-	int k;
-	
-	sleepms(100);
-	t1 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy(dst, src, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (k = times; k > 0; k--) {
-		memcpy_fast(dst, src, size);
-	}
-	t2 = gettime() - t2;
-
-	free(DATA1);
-	free(DATA2);
-
-	printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",  
-		dstalign? "aligned" : "unalign", 
-		srcalign? "aligned" : "unalign", (int)t2, (int)t1);
-}
-
-
-void bench(int copysize, int times)
-{
-	printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
-	benchmark(1, 1, copysize, times);
-	benchmark(1, 0, copysize, times);
-	benchmark(0, 1, copysize, times);
-	benchmark(0, 0, copysize, times);
-	printf("\n");
-}
-
-
-void random_bench(int maxsize, int times)
-{
-	static char A[11 * 1024 * 1024 + 2];
-	static char B[11 * 1024 * 1024 + 2];
-	static int random_offsets[0x10000];
-	static int random_sizes[0x8000];
-	unsigned int i, p1, p2;
-	unsigned int t1, t2;
-	for (i = 0; i < 0x10000; i++) {	// generate random offsets
-		random_offsets[i] = rand() % (10 * 1024 * 1024 + 1);
-	}
-	for (i = 0; i < 0x8000; i++) {	// generate random sizes
-		random_sizes[i] = 1 + rand() % maxsize;
-	}
-	sleepms(100);
-	t1 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy(A + offset1, B + offset2, size);
-	}
-	t1 = gettime() - t1;
-	sleepms(100);
-	t2 = gettime();
-	for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
-		int offset1 = random_offsets[(p1++) & 0xffff];
-		int offset2 = random_offsets[(p1++) & 0xffff];
-		int size = random_sizes[(p2++) & 0x7fff];
-		memcpy_fast(A + offset1, B + offset2, size);
-	}
-	t2 = gettime() - t2;
-	printf("benchmark random access:\n");
-	printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
-}
-
-
-#ifdef _MSC_VER
-#pragma comment(lib, "winmm.lib")
-#endif
-
-int main(void)
-{
-#if 1
-	bench(32, 0x1000000);
-	bench(64, 0x1000000);
-	bench(512, 0x800000);
-	bench(1024, 0x400000);
-#endif
-	bench(4096, 0x80000);
-	bench(8192, 0x40000);
-#if 1
-	bench(1024 * 1024 * 1, 0x800);
-	bench(1024 * 1024 * 4, 0x200);
-#endif
-	bench(1024 * 1024 * 8, 0x100);
-	
-	random_bench(2048, 8000000);
-
-	return 0;
-}
-
-
-
-
-/*
-
-*/
-
-
-
-
--- a/contrib/FastMemcpy/FastMemcpy_Avx.h
+++ b/contrib/FastMemcpy/FastMemcpy_Avx.h
@ -1,492 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
-//
-//=====================================================================
-#ifndef __FAST_MEMCPY_H__
-#define __FAST_MEMCPY_H__
-
-#include <stddef.h>
-#include <stdint.h>
-#include <immintrin.h>
-
-
-//---------------------------------------------------------------------
-// force inline for compilers
-//---------------------------------------------------------------------
-#ifndef INLINE
-#ifdef __GNUC__
-#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
-    #define INLINE         __inline__ __attribute__((always_inline))
-#else
-    #define INLINE         __inline__
-#endif
-#elif defined(_MSC_VER)
-	#define INLINE __forceinline
-#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
-    #define INLINE __inline
-#else
-    #define INLINE 
-#endif
-#endif
-
-
-
-//---------------------------------------------------------------------
-// fast copy for different sizes
-//---------------------------------------------------------------------
-static INLINE void memcpy_avx_16(void *dst, const void *src) {
-#if 1
-	__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
-	_mm_storeu_si128(((__m128i*)dst) + 0, m0);
-#else
-	*((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0));
-	*((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8));
-#endif
-}
-
-static INLINE void memcpy_avx_32(void *dst, const void *src) {
-	__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-	_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
-}
-
-static INLINE void memcpy_avx_64(void *dst, const void *src) {
-	__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-	__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
-	_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
-	_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
-}
-
-static INLINE void memcpy_avx_128(void *dst, const void *src) {
-	__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-	__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
-	__m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
-	__m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
-	_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
-	_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
-	_mm256_storeu_si256(((__m256i*)dst) + 2, m2);
-	_mm256_storeu_si256(((__m256i*)dst) + 3, m3);
-}
-
-static INLINE void memcpy_avx_256(void *dst, const void *src) {
-	__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-	__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
-	__m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
-	__m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
-	__m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
-	__m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
-	__m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
-	__m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
-	_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
-	_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
-	_mm256_storeu_si256(((__m256i*)dst) + 2, m2);
-	_mm256_storeu_si256(((__m256i*)dst) + 3, m3);
-	_mm256_storeu_si256(((__m256i*)dst) + 4, m4);
-	_mm256_storeu_si256(((__m256i*)dst) + 5, m5);
-	_mm256_storeu_si256(((__m256i*)dst) + 6, m6);
-	_mm256_storeu_si256(((__m256i*)dst) + 7, m7);
-}
-
-
-//---------------------------------------------------------------------
-// tiny memory copy with jump table optimized
-//---------------------------------------------------------------------
-static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
-	unsigned char *dd = ((unsigned char*)dst) + size;
-	const unsigned char *ss = ((const unsigned char*)src) + size;
-
-	switch (size) { 
-	case 128: memcpy_avx_128(dd - 128, ss - 128);
-	case 0:  break;
-	case 129: memcpy_avx_128(dd - 129, ss - 129);
-	case 1: dd[-1] = ss[-1]; break;
-	case 130: memcpy_avx_128(dd - 130, ss - 130);
-	case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 131: memcpy_avx_128(dd - 131, ss - 131);
-	case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break;
-	case 132: memcpy_avx_128(dd - 132, ss - 132);
-	case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 133: memcpy_avx_128(dd - 133, ss - 133);
-	case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break;
-	case 134: memcpy_avx_128(dd - 134, ss - 134);
-	case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 135: memcpy_avx_128(dd - 135, ss - 135);
-	case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 136: memcpy_avx_128(dd - 136, ss - 136);
-	case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 137: memcpy_avx_128(dd - 137, ss - 137);
-	case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break;
-	case 138: memcpy_avx_128(dd - 138, ss - 138);
-	case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 139: memcpy_avx_128(dd - 139, ss - 139);
-	case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 140: memcpy_avx_128(dd - 140, ss - 140);
-	case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 141: memcpy_avx_128(dd - 141, ss - 141);
-	case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 142: memcpy_avx_128(dd - 142, ss - 142);
-	case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 143: memcpy_avx_128(dd - 143, ss - 143);
-	case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 144: memcpy_avx_128(dd - 144, ss - 144);
-	case 16: memcpy_avx_16(dd - 16, ss - 16); break;
-	case 145: memcpy_avx_128(dd - 145, ss - 145);
-	case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break;
-	case 146: memcpy_avx_128(dd - 146, ss - 146);
-	case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 147: memcpy_avx_128(dd - 147, ss - 147);
-	case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 148: memcpy_avx_128(dd - 148, ss - 148);
-	case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 149: memcpy_avx_128(dd - 149, ss - 149);
-	case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 150: memcpy_avx_128(dd - 150, ss - 150);
-	case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 151: memcpy_avx_128(dd - 151, ss - 151);
-	case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 152: memcpy_avx_128(dd - 152, ss - 152);
-	case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 153: memcpy_avx_128(dd - 153, ss - 153);
-	case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 154: memcpy_avx_128(dd - 154, ss - 154);
-	case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 155: memcpy_avx_128(dd - 155, ss - 155);
-	case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 156: memcpy_avx_128(dd - 156, ss - 156);
-	case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 157: memcpy_avx_128(dd - 157, ss - 157);
-	case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 158: memcpy_avx_128(dd - 158, ss - 158);
-	case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 159: memcpy_avx_128(dd - 159, ss - 159);
-	case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 160: memcpy_avx_128(dd - 160, ss - 160);
-	case 32: memcpy_avx_32(dd - 32, ss - 32); break;
-	case 161: memcpy_avx_128(dd - 161, ss - 161);
-	case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break;
-	case 162: memcpy_avx_128(dd - 162, ss - 162);
-	case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 163: memcpy_avx_128(dd - 163, ss - 163);
-	case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 164: memcpy_avx_128(dd - 164, ss - 164);
-	case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 165: memcpy_avx_128(dd - 165, ss - 165);
-	case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 166: memcpy_avx_128(dd - 166, ss - 166);
-	case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 167: memcpy_avx_128(dd - 167, ss - 167);
-	case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 168: memcpy_avx_128(dd - 168, ss - 168);
-	case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 169: memcpy_avx_128(dd - 169, ss - 169);
-	case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 170: memcpy_avx_128(dd - 170, ss - 170);
-	case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 171: memcpy_avx_128(dd - 171, ss - 171);
-	case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 172: memcpy_avx_128(dd - 172, ss - 172);
-	case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 173: memcpy_avx_128(dd - 173, ss - 173);
-	case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 174: memcpy_avx_128(dd - 174, ss - 174);
-	case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 175: memcpy_avx_128(dd - 175, ss - 175);
-	case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 176: memcpy_avx_128(dd - 176, ss - 176);
-	case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 177: memcpy_avx_128(dd - 177, ss - 177);
-	case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 178: memcpy_avx_128(dd - 178, ss - 178);
-	case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 179: memcpy_avx_128(dd - 179, ss - 179);
-	case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 180: memcpy_avx_128(dd - 180, ss - 180);
-	case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 181: memcpy_avx_128(dd - 181, ss - 181);
-	case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 182: memcpy_avx_128(dd - 182, ss - 182);
-	case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 183: memcpy_avx_128(dd - 183, ss - 183);
-	case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 184: memcpy_avx_128(dd - 184, ss - 184);
-	case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 185: memcpy_avx_128(dd - 185, ss - 185);
-	case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 186: memcpy_avx_128(dd - 186, ss - 186);
-	case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 187: memcpy_avx_128(dd - 187, ss - 187);
-	case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 188: memcpy_avx_128(dd - 188, ss - 188);
-	case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 189: memcpy_avx_128(dd - 189, ss - 189);
-	case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 190: memcpy_avx_128(dd - 190, ss - 190);
-	case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 191: memcpy_avx_128(dd - 191, ss - 191);
-	case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 192: memcpy_avx_128(dd - 192, ss - 192);
-	case 64: memcpy_avx_64(dd - 64, ss - 64); break;
-	case 193: memcpy_avx_128(dd - 193, ss - 193);
-	case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break;
-	case 194: memcpy_avx_128(dd - 194, ss - 194);
-	case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
-	case 195: memcpy_avx_128(dd - 195, ss - 195);
-	case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 196: memcpy_avx_128(dd - 196, ss - 196);
-	case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
-	case 197: memcpy_avx_128(dd - 197, ss - 197);
-	case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 198: memcpy_avx_128(dd - 198, ss - 198);
-	case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 199: memcpy_avx_128(dd - 199, ss - 199);
-	case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 200: memcpy_avx_128(dd - 200, ss - 200);
-	case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
-	case 201: memcpy_avx_128(dd - 201, ss - 201);
-	case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 202: memcpy_avx_128(dd - 202, ss - 202);
-	case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 203: memcpy_avx_128(dd - 203, ss - 203);
-	case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 204: memcpy_avx_128(dd - 204, ss - 204);
-	case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 205: memcpy_avx_128(dd - 205, ss - 205);
-	case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 206: memcpy_avx_128(dd - 206, ss - 206);
-	case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 207: memcpy_avx_128(dd - 207, ss - 207);
-	case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 208: memcpy_avx_128(dd - 208, ss - 208);
-	case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break;
-	case 209: memcpy_avx_128(dd - 209, ss - 209);
-	case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 210: memcpy_avx_128(dd - 210, ss - 210);
-	case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 211: memcpy_avx_128(dd - 211, ss - 211);
-	case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 212: memcpy_avx_128(dd - 212, ss - 212);
-	case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 213: memcpy_avx_128(dd - 213, ss - 213);
-	case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 214: memcpy_avx_128(dd - 214, ss - 214);
-	case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 215: memcpy_avx_128(dd - 215, ss - 215);
-	case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 216: memcpy_avx_128(dd - 216, ss - 216);
-	case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 217: memcpy_avx_128(dd - 217, ss - 217);
-	case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 218: memcpy_avx_128(dd - 218, ss - 218);
-	case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 219: memcpy_avx_128(dd - 219, ss - 219);
-	case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 220: memcpy_avx_128(dd - 220, ss - 220);
-	case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 221: memcpy_avx_128(dd - 221, ss - 221);
-	case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 222: memcpy_avx_128(dd - 222, ss - 222);
-	case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 223: memcpy_avx_128(dd - 223, ss - 223);
-	case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 224: memcpy_avx_128(dd - 224, ss - 224);
-	case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break;
-	case 225: memcpy_avx_128(dd - 225, ss - 225);
-	case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 226: memcpy_avx_128(dd - 226, ss - 226);
-	case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 227: memcpy_avx_128(dd - 227, ss - 227);
-	case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 228: memcpy_avx_128(dd - 228, ss - 228);
-	case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 229: memcpy_avx_128(dd - 229, ss - 229);
-	case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 230: memcpy_avx_128(dd - 230, ss - 230);
-	case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 231: memcpy_avx_128(dd - 231, ss - 231);
-	case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 232: memcpy_avx_128(dd - 232, ss - 232);
-	case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 233: memcpy_avx_128(dd - 233, ss - 233);
-	case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 234: memcpy_avx_128(dd - 234, ss - 234);
-	case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 235: memcpy_avx_128(dd - 235, ss - 235);
-	case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 236: memcpy_avx_128(dd - 236, ss - 236);
-	case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 237: memcpy_avx_128(dd - 237, ss - 237);
-	case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 238: memcpy_avx_128(dd - 238, ss - 238);
-	case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 239: memcpy_avx_128(dd - 239, ss - 239);
-	case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 240: memcpy_avx_128(dd - 240, ss - 240);
-	case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 241: memcpy_avx_128(dd - 241, ss - 241);
-	case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 242: memcpy_avx_128(dd - 242, ss - 242);
-	case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 243: memcpy_avx_128(dd - 243, ss - 243);
-	case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 244: memcpy_avx_128(dd - 244, ss - 244);
-	case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 245: memcpy_avx_128(dd - 245, ss - 245);
-	case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 246: memcpy_avx_128(dd - 246, ss - 246);
-	case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 247: memcpy_avx_128(dd - 247, ss - 247);
-	case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 248: memcpy_avx_128(dd - 248, ss - 248);
-	case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 249: memcpy_avx_128(dd - 249, ss - 249);
-	case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 250: memcpy_avx_128(dd - 250, ss - 250);
-	case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 251: memcpy_avx_128(dd - 251, ss - 251);
-	case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 252: memcpy_avx_128(dd - 252, ss - 252);
-	case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 253: memcpy_avx_128(dd - 253, ss - 253);
-	case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 254: memcpy_avx_128(dd - 254, ss - 254);
-	case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 255: memcpy_avx_128(dd - 255, ss - 255);
-	case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break;
-	case 256: memcpy_avx_256(dd - 256, ss - 256); break;
-	}
-
-	return dst;
-}
-
-
-//---------------------------------------------------------------------
-// main routine
-//---------------------------------------------------------------------
-static void* memcpy_fast(void *destination, const void *source, size_t size)
-{
-	unsigned char *dst = (unsigned char*)destination;
-	const unsigned char *src = (const unsigned char*)source;
-	static size_t cachesize = 0x200000; // L3-cache size
-	size_t padding;
-
-	// small memory copy
-	if (size <= 256) {
-		memcpy_tiny(dst, src, size);
-		_mm256_zeroupper();
-		return destination;
-	}
-
-	// align destination to 16 bytes boundary
-	padding = (32 - (((size_t)dst) & 31)) & 31;
-
-#if 0
-	if (padding > 0) {
-		__m256i head = _mm256_loadu_si256((const __m256i*)src);
-		_mm256_storeu_si256((__m256i*)dst, head);
-		dst += padding;
-		src += padding;
-		size -= padding;
-	}
-#else
-	__m256i head = _mm256_loadu_si256((const __m256i*)src);
-	_mm256_storeu_si256((__m256i*)dst, head);
-	dst += padding;
-	src += padding;
-	size -= padding;
-#endif
-
-	// medium size copy
-	if (size <= cachesize) {
-		__m256i c0, c1, c2, c3, c4, c5, c6, c7;
-
-		for (; size >= 256; size -= 256) {
-			c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-			c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
-			c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
-			c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
-			c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
-			c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
-			c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
-			c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
-			_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
-			src += 256;
-			_mm256_storeu_si256((((__m256i*)dst) + 0), c0);
-			_mm256_storeu_si256((((__m256i*)dst) + 1), c1);
-			_mm256_storeu_si256((((__m256i*)dst) + 2), c2);
-			_mm256_storeu_si256((((__m256i*)dst) + 3), c3);
-			_mm256_storeu_si256((((__m256i*)dst) + 4), c4);
-			_mm256_storeu_si256((((__m256i*)dst) + 5), c5);
-			_mm256_storeu_si256((((__m256i*)dst) + 6), c6);
-			_mm256_storeu_si256((((__m256i*)dst) + 7), c7);
-			dst += 256;
-		}
-	}
-	else {		// big memory copy
-		__m256i c0, c1, c2, c3, c4, c5, c6, c7;
-		/* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */
-
-		_mm_prefetch((const char*)(src), _MM_HINT_NTA);
-
-		if ((((size_t)src) & 31) == 0) {	// source aligned
-			for (; size >= 256; size -= 256) {
-				c0 = _mm256_load_si256(((const __m256i*)src) + 0);
-				c1 = _mm256_load_si256(((const __m256i*)src) + 1);
-				c2 = _mm256_load_si256(((const __m256i*)src) + 2);
-				c3 = _mm256_load_si256(((const __m256i*)src) + 3);
-				c4 = _mm256_load_si256(((const __m256i*)src) + 4);
-				c5 = _mm256_load_si256(((const __m256i*)src) + 5);
-				c6 = _mm256_load_si256(((const __m256i*)src) + 6);
-				c7 = _mm256_load_si256(((const __m256i*)src) + 7);
-				_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
-				src += 256;
-				_mm256_stream_si256((((__m256i*)dst) + 0), c0);
-				_mm256_stream_si256((((__m256i*)dst) + 1), c1);
-				_mm256_stream_si256((((__m256i*)dst) + 2), c2);
-				_mm256_stream_si256((((__m256i*)dst) + 3), c3);
-				_mm256_stream_si256((((__m256i*)dst) + 4), c4);
-				_mm256_stream_si256((((__m256i*)dst) + 5), c5);
-				_mm256_stream_si256((((__m256i*)dst) + 6), c6);
-				_mm256_stream_si256((((__m256i*)dst) + 7), c7);
-				dst += 256;
-			}
-		}
-		else {							// source unaligned
-			for (; size >= 256; size -= 256) {
-				c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
-				c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
-				c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
-				c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
-				c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
-				c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
-				c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
-				c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
-				_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
-				src += 256;
-				_mm256_stream_si256((((__m256i*)dst) + 0), c0);
-				_mm256_stream_si256((((__m256i*)dst) + 1), c1);
-				_mm256_stream_si256((((__m256i*)dst) + 2), c2);
-				_mm256_stream_si256((((__m256i*)dst) + 3), c3);
-				_mm256_stream_si256((((__m256i*)dst) + 4), c4);
-				_mm256_stream_si256((((__m256i*)dst) + 5), c5);
-				_mm256_stream_si256((((__m256i*)dst) + 6), c6);
-				_mm256_stream_si256((((__m256i*)dst) + 7), c7);
-				dst += 256;
-			}
-		}
-		_mm_sfence();
-	}
-
-	memcpy_tiny(dst, src, size);
-	_mm256_zeroupper();
-
-	return destination;
-}
-
-
-#endif
-
-
-
--- a/contrib/FastMemcpy/LICENSE
+++ b/contrib/FastMemcpy/LICENSE
@ -1,22 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2015 Linwei
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
--- a/contrib/FastMemcpy/README.md
+++ b/contrib/FastMemcpy/README.md
@ -1,20 +0,0 @@
-Internal implementation of `memcpy` function.
-
-It has the following advantages over `libc`-supplied implementation:
- it is linked statically, so the function is called directly, not through a `PLT` (procedure lookup table of shared library);
- it is linked statically, so the function can have position-dependent code;
- your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library;
- you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions;
- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents).
-
-Currently it uses the implementation from **Linwei** (skywind3000@163.com).
-Look at https://www.zhihu.com/question/35172305 for discussion.
-
-Drawbacks:
- only use SSE 2, doesn't use wider (AVX, AVX 512) vector registers when available;
- no CPU dispatching; doesn't take into account actual cache size.
-
-Also worth to look at:
- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S
- implementation from Agner Fog: http://www.agner.org/optimize/
- glibc source code.
--- a/contrib/FastMemcpy/memcpy_wrapper.c
+++ b/contrib/FastMemcpy/memcpy_wrapper.c
@ -1,6 +0,0 @@
-#include "FastMemcpy.h"
-
-void * memcpy(void * __restrict destination, const void * __restrict source, size_t size)
-{
-    return memcpy_fast(destination, source, size);
-}
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS)
    add_subdirectory (db-generator)
    add_subdirectory (wal-dump)
    add_subdirectory (check-mysql-binlog)
+    add_subdirectory (memcpy-bench)
 endif ()

 if (ENABLE_CODE_QUALITY)
--- a/utils/memcpy-bench/CMakeLists.txt
+++ b/utils/memcpy-bench/CMakeLists.txt
@ -0,0 +1,5 @@
+enable_language(ASM)
+add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
+#target_compile_options(memcpy-bench PRIVATE -mavx)
+target_link_libraries(memcpy-bench PRIVATE dbms)
+
--- a/utils/memcpy-bench/FastMemcpy.h
+++ b/utils/memcpy-bench/FastMemcpy.h
@ -0,0 +1,770 @@
+#pragma once
+
+//=====================================================================
+//
+// FastMemcpy.c - skywind3000@163.com, 2015
+//
+// feature:
+// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
+//
+//=====================================================================
+
+#include <stddef.h>
+#include <stdint.h>
+#include <emmintrin.h>
+
+
+//---------------------------------------------------------------------
+// force inline for compilers
+//---------------------------------------------------------------------
+#ifndef INLINE
+#ifdef __GNUC__
+#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+    #define INLINE         __inline__ __attribute__((always_inline))
+#else
+    #define INLINE         __inline__
+#endif
+#elif defined(_MSC_VER)
+    #define INLINE __forceinline
+#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
+    #define INLINE __inline
+#else
+    #define INLINE
+#endif
+#endif
+
+typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
+typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
+typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
+
+//---------------------------------------------------------------------
+// fast copy for different sizes
+//---------------------------------------------------------------------
+static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src)
+{
+    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
+}
+
+static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src)
+{
+    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
+}
+
+static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src)
+{
+    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+    __m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
+    __m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
+}
+
+static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
+{
+    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+    __m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
+    __m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
+    __m128i m4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
+    __m128i m5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
+    __m128i m6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
+    __m128i m7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6);
+    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7);
+}
+
+
+//---------------------------------------------------------------------
+// tiny memory copy with jump table optimized
+//---------------------------------------------------------------------
+/// Attribute is used to avoid an error with undefined behaviour sanitizer
+/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
+/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
+__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    unsigned char *dd = ((unsigned char*)dst) + size;
+    const unsigned char *ss = ((const unsigned char*)src) + size;
+
+    switch (size)
+    {
+    case 64:
+        memcpy_sse2_64(dd - 64, ss - 64);
+        [[fallthrough]];
+    case 0:
+        break;
+
+    case 65:
+        memcpy_sse2_64(dd - 65, ss - 65);
+        [[fallthrough]];
+    case 1:
+        dd[-1] = ss[-1];
+        break;
+
+    case 66:
+        memcpy_sse2_64(dd - 66, ss - 66);
+        [[fallthrough]];
+    case 2:
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 67:
+        memcpy_sse2_64(dd - 67, ss - 67);
+        [[fallthrough]];
+    case 3:
+        *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
+        dd[-1] = ss[-1];
+        break;
+
+    case 68:
+        memcpy_sse2_64(dd - 68, ss - 68);
+        [[fallthrough]];
+    case 4:
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 69:
+        memcpy_sse2_64(dd - 69, ss - 69);
+        [[fallthrough]];
+    case 5:
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 70:
+        memcpy_sse2_64(dd - 70, ss - 70);
+        [[fallthrough]];
+    case 6:
+        *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 71:
+        memcpy_sse2_64(dd - 71, ss - 71);
+        [[fallthrough]];
+    case 7:
+        *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 72:
+        memcpy_sse2_64(dd - 72, ss - 72);
+        [[fallthrough]];
+    case 8:
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 73:
+        memcpy_sse2_64(dd - 73, ss - 73);
+        [[fallthrough]];
+    case 9:
+        *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
+        dd[-1] = ss[-1];
+        break;
+
+    case 74:
+        memcpy_sse2_64(dd - 74, ss - 74);
+        [[fallthrough]];
+    case 10:
+        *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 75:
+        memcpy_sse2_64(dd - 75, ss - 75);
+        [[fallthrough]];
+    case 11:
+        *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 76:
+        memcpy_sse2_64(dd - 76, ss - 76);
+        [[fallthrough]];
+    case 12:
+        *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 77:
+        memcpy_sse2_64(dd - 77, ss - 77);
+        [[fallthrough]];
+    case 13:
+        *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 78:
+        memcpy_sse2_64(dd - 78, ss - 78);
+        [[fallthrough]];
+    case 14:
+        *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 79:
+        memcpy_sse2_64(dd - 79, ss - 79);
+        [[fallthrough]];
+    case 15:
+        *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 80:
+        memcpy_sse2_64(dd - 80, ss - 80);
+        [[fallthrough]];
+    case 16:
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 81:
+        memcpy_sse2_64(dd - 81, ss - 81);
+        [[fallthrough]];
+    case 17:
+        memcpy_sse2_16(dd - 17, ss - 17);
+        dd[-1] = ss[-1];
+        break;
+
+    case 82:
+        memcpy_sse2_64(dd - 82, ss - 82);
+        [[fallthrough]];
+    case 18:
+        memcpy_sse2_16(dd - 18, ss - 18);
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 83:
+        memcpy_sse2_64(dd - 83, ss - 83);
+        [[fallthrough]];
+    case 19:
+        memcpy_sse2_16(dd - 19, ss - 19);
+        *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
+        dd[-1] = ss[-1];
+        break;
+
+    case 84:
+        memcpy_sse2_64(dd - 84, ss - 84);
+        [[fallthrough]];
+    case 20:
+        memcpy_sse2_16(dd - 20, ss - 20);
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 85:
+        memcpy_sse2_64(dd - 85, ss - 85);
+        [[fallthrough]];
+    case 21:
+        memcpy_sse2_16(dd - 21, ss - 21);
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 86:
+        memcpy_sse2_64(dd - 86, ss - 86);
+        [[fallthrough]];
+    case 22:
+        memcpy_sse2_16(dd - 22, ss - 22);
+        *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 87:
+        memcpy_sse2_64(dd - 87, ss - 87);
+        [[fallthrough]];
+    case 23:
+        memcpy_sse2_16(dd - 23, ss - 23);
+        *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 88:
+        memcpy_sse2_64(dd - 88, ss - 88);
+        [[fallthrough]];
+    case 24:
+        memcpy_sse2_16(dd - 24, ss - 24);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 89:
+        memcpy_sse2_64(dd - 89, ss - 89);
+        [[fallthrough]];
+    case 25:
+        memcpy_sse2_16(dd - 25, ss - 25);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 90:
+        memcpy_sse2_64(dd - 90, ss - 90);
+        [[fallthrough]];
+    case 26:
+        memcpy_sse2_16(dd - 26, ss - 26);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 91:
+        memcpy_sse2_64(dd - 91, ss - 91);
+        [[fallthrough]];
+    case 27:
+        memcpy_sse2_16(dd - 27, ss - 27);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 92:
+        memcpy_sse2_64(dd - 92, ss - 92);
+        [[fallthrough]];
+    case 28:
+        memcpy_sse2_16(dd - 28, ss - 28);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 93:
+        memcpy_sse2_64(dd - 93, ss - 93);
+        [[fallthrough]];
+    case 29:
+        memcpy_sse2_16(dd - 29, ss - 29);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 94:
+        memcpy_sse2_64(dd - 94, ss - 94);
+        [[fallthrough]];
+    case 30:
+        memcpy_sse2_16(dd - 30, ss - 30);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 95:
+        memcpy_sse2_64(dd - 95, ss - 95);
+        [[fallthrough]];
+    case 31:
+        memcpy_sse2_16(dd - 31, ss - 31);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 96:
+        memcpy_sse2_64(dd - 96, ss - 96);
+        [[fallthrough]];
+    case 32:
+        memcpy_sse2_32(dd - 32, ss - 32);
+        break;
+
+    case 97:
+        memcpy_sse2_64(dd - 97, ss - 97);
+        [[fallthrough]];
+    case 33:
+        memcpy_sse2_32(dd - 33, ss - 33);
+        dd[-1] = ss[-1];
+        break;
+
+    case 98:
+        memcpy_sse2_64(dd - 98, ss - 98);
+        [[fallthrough]];
+    case 34:
+        memcpy_sse2_32(dd - 34, ss - 34);
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 99:
+        memcpy_sse2_64(dd - 99, ss - 99);
+        [[fallthrough]];
+    case 35:
+        memcpy_sse2_32(dd - 35, ss - 35);
+        *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
+        dd[-1] = ss[-1];
+        break;
+
+    case 100:
+        memcpy_sse2_64(dd - 100, ss - 100);
+        [[fallthrough]];
+    case 36:
+        memcpy_sse2_32(dd - 36, ss - 36);
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 101:
+        memcpy_sse2_64(dd - 101, ss - 101);
+        [[fallthrough]];
+    case 37:
+        memcpy_sse2_32(dd - 37, ss - 37);
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 102:
+        memcpy_sse2_64(dd - 102, ss - 102);
+        [[fallthrough]];
+    case 38:
+        memcpy_sse2_32(dd - 38, ss - 38);
+        *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 103:
+        memcpy_sse2_64(dd - 103, ss - 103);
+        [[fallthrough]];
+    case 39:
+        memcpy_sse2_32(dd - 39, ss - 39);
+        *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 104:
+        memcpy_sse2_64(dd - 104, ss - 104);
+        [[fallthrough]];
+    case 40:
+        memcpy_sse2_32(dd - 40, ss - 40);
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 105:
+        memcpy_sse2_64(dd - 105, ss - 105);
+        [[fallthrough]];
+    case 41:
+        memcpy_sse2_32(dd - 41, ss - 41);
+        *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
+        dd[-1] = ss[-1];
+        break;
+
+    case 106:
+        memcpy_sse2_64(dd - 106, ss - 106);
+        [[fallthrough]];
+    case 42:
+        memcpy_sse2_32(dd - 42, ss - 42);
+        *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 107:
+        memcpy_sse2_64(dd - 107, ss - 107);
+        [[fallthrough]];
+    case 43:
+        memcpy_sse2_32(dd - 43, ss - 43);
+        *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 108:
+        memcpy_sse2_64(dd - 108, ss - 108);
+        [[fallthrough]];
+    case 44:
+        memcpy_sse2_32(dd - 44, ss - 44);
+        *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 109:
+        memcpy_sse2_64(dd - 109, ss - 109);
+        [[fallthrough]];
+    case 45:
+        memcpy_sse2_32(dd - 45, ss - 45);
+        *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 110:
+        memcpy_sse2_64(dd - 110, ss - 110);
+        [[fallthrough]];
+    case 46:
+        memcpy_sse2_32(dd - 46, ss - 46);
+        *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 111:
+        memcpy_sse2_64(dd - 111, ss - 111);
+        [[fallthrough]];
+    case 47:
+        memcpy_sse2_32(dd - 47, ss - 47);
+        *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
+        *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
+        break;
+
+    case 112:
+        memcpy_sse2_64(dd - 112, ss - 112);
+        [[fallthrough]];
+    case 48:
+        memcpy_sse2_32(dd - 48, ss - 48);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 113:
+        memcpy_sse2_64(dd - 113, ss - 113);
+        [[fallthrough]];
+    case 49:
+        memcpy_sse2_32(dd - 49, ss - 49);
+        memcpy_sse2_16(dd - 17, ss - 17);
+        dd[-1] = ss[-1];
+        break;
+
+    case 114:
+        memcpy_sse2_64(dd - 114, ss - 114);
+        [[fallthrough]];
+    case 50:
+        memcpy_sse2_32(dd - 50, ss - 50);
+        memcpy_sse2_16(dd - 18, ss - 18);
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 115:
+        memcpy_sse2_64(dd - 115, ss - 115);
+        [[fallthrough]];
+    case 51:
+        memcpy_sse2_32(dd - 51, ss - 51);
+        memcpy_sse2_16(dd - 19, ss - 19);
+        *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
+        dd[-1] = ss[-1];
+        break;
+
+    case 116:
+        memcpy_sse2_64(dd - 116, ss - 116);
+        [[fallthrough]];
+    case 52:
+        memcpy_sse2_32(dd - 52, ss - 52);
+        memcpy_sse2_16(dd - 20, ss - 20);
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 117:
+        memcpy_sse2_64(dd - 117, ss - 117);
+        [[fallthrough]];
+    case 53:
+        memcpy_sse2_32(dd - 53, ss - 53);
+        memcpy_sse2_16(dd - 21, ss - 21);
+        *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
+        dd[-1] = ss[-1];
+        break;
+
+    case 118:
+        memcpy_sse2_64(dd - 118, ss - 118);
+        [[fallthrough]];
+    case 54:
+        memcpy_sse2_32(dd - 54, ss - 54);
+        memcpy_sse2_16(dd - 22, ss - 22);
+        *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
+        *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
+        break;
+
+    case 119:
+        memcpy_sse2_64(dd - 119, ss - 119);
+        [[fallthrough]];
+    case 55:
+        memcpy_sse2_32(dd - 55, ss - 55);
+        memcpy_sse2_16(dd - 23, ss - 23);
+        *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
+        *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
+        break;
+
+    case 120:
+        memcpy_sse2_64(dd - 120, ss - 120);
+        [[fallthrough]];
+    case 56:
+        memcpy_sse2_32(dd - 56, ss - 56);
+        memcpy_sse2_16(dd - 24, ss - 24);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 121:
+        memcpy_sse2_64(dd - 121, ss - 121);
+        [[fallthrough]];
+    case 57:
+        memcpy_sse2_32(dd - 57, ss - 57);
+        memcpy_sse2_16(dd - 25, ss - 25);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 122:
+        memcpy_sse2_64(dd - 122, ss - 122);
+        [[fallthrough]];
+    case 58:
+        memcpy_sse2_32(dd - 58, ss - 58);
+        memcpy_sse2_16(dd - 26, ss - 26);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 123:
+        memcpy_sse2_64(dd - 123, ss - 123);
+        [[fallthrough]];
+    case 59:
+        memcpy_sse2_32(dd - 59, ss - 59);
+        memcpy_sse2_16(dd - 27, ss - 27);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 124:
+        memcpy_sse2_64(dd - 124, ss - 124);
+        [[fallthrough]];
+    case 60:
+        memcpy_sse2_32(dd - 60, ss - 60);
+        memcpy_sse2_16(dd - 28, ss - 28);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 125:
+        memcpy_sse2_64(dd - 125, ss - 125);
+        [[fallthrough]];
+    case 61:
+        memcpy_sse2_32(dd - 61, ss - 61);
+        memcpy_sse2_16(dd - 29, ss - 29);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 126:
+        memcpy_sse2_64(dd - 126, ss - 126);
+        [[fallthrough]];
+    case 62:
+        memcpy_sse2_32(dd - 62, ss - 62);
+        memcpy_sse2_16(dd - 30, ss - 30);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 127:
+        memcpy_sse2_64(dd - 127, ss - 127);
+        [[fallthrough]];
+    case 63:
+        memcpy_sse2_32(dd - 63, ss - 63);
+        memcpy_sse2_16(dd - 31, ss - 31);
+        memcpy_sse2_16(dd - 16, ss - 16);
+        break;
+
+    case 128:
+        memcpy_sse2_128(dd - 128, ss - 128);
+        break;
+    }
+
+    return dst;
+}
+
+
+//---------------------------------------------------------------------
+// main routine
+//---------------------------------------------------------------------
+void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = (unsigned char*)destination;
+    const unsigned char *src = (const unsigned char*)source;
+    static size_t cachesize = 0x200000; // L2-cache size
+    size_t padding;
+
+    // small memory copy
+    if (size <= 128)
+{
+        return memcpy_tiny(dst, src, size);
+    }
+
+    // align destination to 16 bytes boundary
+    padding = (16 - (((size_t)dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    // medium size copy
+    if (size <= cachesize)
+    {
+        __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+        for (; size >= 128; size -= 128)
+        {
+            c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+            c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+            c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
+            c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
+            c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
+            c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
+            c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
+            c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
+            _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
+            src += 128;
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
+            _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
+            dst += 128;
+        }
+    }
+    else
+    {        // big memory copy
+        __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+        _mm_prefetch((const char*)(src), _MM_HINT_NTA);
+
+        if ((((size_t)src) & 15) == 0)
+        {    // source aligned
+            for (; size >= 128; size -= 128)
+            {
+                c0 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+                c1 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+                c2 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 2);
+                c3 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 3);
+                c4 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 4);
+                c5 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 5);
+                c6 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 6);
+                c7 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 7);
+                _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
+                src += 128;
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
+                dst += 128;
+            }
+        }
+        else
+        {                            // source unaligned
+            for (; size >= 128; size -= 128)
+            {
+                c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
+                c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
+                c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
+                c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
+                c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
+                c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
+                c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
+                c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
+                _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
+                src += 128;
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
+                _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
+                dst += 128;
+            }
+        }
+        _mm_sfence();
+    }
+
+    memcpy_tiny(dst, src, size);
+
+    return destination;
+}
--- a/utils/memcpy-bench/FastMemcpy_Avx.h
+++ b/utils/memcpy-bench/FastMemcpy_Avx.h
@ -0,0 +1,496 @@
+#pragma once
+
+//=====================================================================
+//
+// FastMemcpy.c - skywind3000@163.com, 2015
+//
+// feature:
+// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
+//
+//=====================================================================
+
+#include <stddef.h>
+#include <stdint.h>
+#include <immintrin.h>
+
+
+//---------------------------------------------------------------------
+// force inline for compilers
+//---------------------------------------------------------------------
+#ifndef INLINE
+#ifdef __GNUC__
+#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+    #define INLINE         __inline__ __attribute__((always_inline))
+#else
+    #define INLINE         __inline__
+#endif
+#elif defined(_MSC_VER)
+    #define INLINE __forceinline
+#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
+    #define INLINE __inline
+#else
+    #define INLINE
+#endif
+#endif
+
+
+//---------------------------------------------------------------------
+// fast copy for different sizes
+//---------------------------------------------------------------------
+static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src)
+{
+#if 1
+    __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
+    _mm_storeu_si128(((__m128i*)dst) + 0, m0);
+#else
+    *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0));
+    *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8));
+#endif
+}
+
+static INLINE void memcpy_avx_32(void *dst, const void *src)
+{
+    __m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
+}
+
+static INLINE void memcpy_avx_64(void *dst, const void *src)
+{
+    __m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+    __m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
+}
+
+static INLINE void memcpy_avx_128(void *dst, const void *src)
+{
+    __m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+    __m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+    __m256i m2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+    __m256i m3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3);
+}
+
+static INLINE void memcpy_avx_256(void *dst, const void *src)
+{
+    __m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+    __m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+    __m256i m2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+    __m256i m3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+    __m256i m4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
+    __m256i m5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
+    __m256i m6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
+    __m256i m7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 4, m4);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 5, m5);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 6, m6);
+    _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 7, m7);
+}
+
+
+//---------------------------------------------------------------------
+// tiny memory copy with jump table optimized
+//---------------------------------------------------------------------
+static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    unsigned char *dd = reinterpret_cast<unsigned char *>(dst) + size;
+    const unsigned char *ss = reinterpret_cast<const unsigned char*>(src) + size;
+
+    switch (size)
+    {
+    case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]];
+    case 0:  break;
+    case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]];
+    case 1: dd[-1] = ss[-1]; break;
+    case 130: memcpy_avx_128(dd - 130, ss - 130); [[fallthrough]];
+    case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 131: memcpy_avx_128(dd - 131, ss - 131); [[fallthrough]];
+    case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break;
+    case 132: memcpy_avx_128(dd - 132, ss - 132); [[fallthrough]];
+    case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 133: memcpy_avx_128(dd - 133, ss - 133); [[fallthrough]];
+    case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break;
+    case 134: memcpy_avx_128(dd - 134, ss - 134); [[fallthrough]];
+    case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 135: memcpy_avx_128(dd - 135, ss - 135); [[fallthrough]];
+    case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 136: memcpy_avx_128(dd - 136, ss - 136); [[fallthrough]];
+    case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 137: memcpy_avx_128(dd - 137, ss - 137); [[fallthrough]];
+    case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break;
+    case 138: memcpy_avx_128(dd - 138, ss - 138); [[fallthrough]];
+    case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 139: memcpy_avx_128(dd - 139, ss - 139); [[fallthrough]];
+    case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 140: memcpy_avx_128(dd - 140, ss - 140); [[fallthrough]];
+    case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 141: memcpy_avx_128(dd - 141, ss - 141); [[fallthrough]];
+    case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 142: memcpy_avx_128(dd - 142, ss - 142); [[fallthrough]];
+    case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 143: memcpy_avx_128(dd - 143, ss - 143); [[fallthrough]];
+    case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 144: memcpy_avx_128(dd - 144, ss - 144); [[fallthrough]];
+    case 16: memcpy_avx_16(dd - 16, ss - 16); break;
+    case 145: memcpy_avx_128(dd - 145, ss - 145); [[fallthrough]];
+    case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break;
+    case 146: memcpy_avx_128(dd - 146, ss - 146); [[fallthrough]];
+    case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 147: memcpy_avx_128(dd - 147, ss - 147); [[fallthrough]];
+    case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 148: memcpy_avx_128(dd - 148, ss - 148); [[fallthrough]];
+    case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 149: memcpy_avx_128(dd - 149, ss - 149); [[fallthrough]];
+    case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 150: memcpy_avx_128(dd - 150, ss - 150); [[fallthrough]];
+    case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 151: memcpy_avx_128(dd - 151, ss - 151); [[fallthrough]];
+    case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 152: memcpy_avx_128(dd - 152, ss - 152); [[fallthrough]];
+    case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 153: memcpy_avx_128(dd - 153, ss - 153); [[fallthrough]];
+    case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 154: memcpy_avx_128(dd - 154, ss - 154); [[fallthrough]];
+    case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 155: memcpy_avx_128(dd - 155, ss - 155); [[fallthrough]];
+    case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 156: memcpy_avx_128(dd - 156, ss - 156); [[fallthrough]];
+    case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 157: memcpy_avx_128(dd - 157, ss - 157); [[fallthrough]];
+    case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 158: memcpy_avx_128(dd - 158, ss - 158); [[fallthrough]];
+    case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 159: memcpy_avx_128(dd - 159, ss - 159); [[fallthrough]];
+    case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 160: memcpy_avx_128(dd - 160, ss - 160); [[fallthrough]];
+    case 32: memcpy_avx_32(dd - 32, ss - 32); break;
+    case 161: memcpy_avx_128(dd - 161, ss - 161); [[fallthrough]];
+    case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break;
+    case 162: memcpy_avx_128(dd - 162, ss - 162); [[fallthrough]];
+    case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 163: memcpy_avx_128(dd - 163, ss - 163); [[fallthrough]];
+    case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 164: memcpy_avx_128(dd - 164, ss - 164); [[fallthrough]];
+    case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 165: memcpy_avx_128(dd - 165, ss - 165); [[fallthrough]];
+    case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 166: memcpy_avx_128(dd - 166, ss - 166); [[fallthrough]];
+    case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 167: memcpy_avx_128(dd - 167, ss - 167); [[fallthrough]];
+    case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 168: memcpy_avx_128(dd - 168, ss - 168); [[fallthrough]];
+    case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 169: memcpy_avx_128(dd - 169, ss - 169); [[fallthrough]];
+    case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 170: memcpy_avx_128(dd - 170, ss - 170); [[fallthrough]];
+    case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 171: memcpy_avx_128(dd - 171, ss - 171); [[fallthrough]];
+    case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 172: memcpy_avx_128(dd - 172, ss - 172); [[fallthrough]];
+    case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 173: memcpy_avx_128(dd - 173, ss - 173); [[fallthrough]];
+    case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 174: memcpy_avx_128(dd - 174, ss - 174); [[fallthrough]];
+    case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 175: memcpy_avx_128(dd - 175, ss - 175); [[fallthrough]];
+    case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 176: memcpy_avx_128(dd - 176, ss - 176); [[fallthrough]];
+    case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 177: memcpy_avx_128(dd - 177, ss - 177); [[fallthrough]];
+    case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 178: memcpy_avx_128(dd - 178, ss - 178); [[fallthrough]];
+    case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 179: memcpy_avx_128(dd - 179, ss - 179); [[fallthrough]];
+    case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 180: memcpy_avx_128(dd - 180, ss - 180); [[fallthrough]];
+    case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 181: memcpy_avx_128(dd - 181, ss - 181); [[fallthrough]];
+    case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 182: memcpy_avx_128(dd - 182, ss - 182); [[fallthrough]];
+    case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 183: memcpy_avx_128(dd - 183, ss - 183); [[fallthrough]];
+    case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 184: memcpy_avx_128(dd - 184, ss - 184); [[fallthrough]];
+    case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 185: memcpy_avx_128(dd - 185, ss - 185); [[fallthrough]];
+    case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 186: memcpy_avx_128(dd - 186, ss - 186); [[fallthrough]];
+    case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 187: memcpy_avx_128(dd - 187, ss - 187); [[fallthrough]];
+    case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 188: memcpy_avx_128(dd - 188, ss - 188); [[fallthrough]];
+    case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 189: memcpy_avx_128(dd - 189, ss - 189); [[fallthrough]];
+    case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 190: memcpy_avx_128(dd - 190, ss - 190); [[fallthrough]];
+    case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 191: memcpy_avx_128(dd - 191, ss - 191); [[fallthrough]];
+    case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 192: memcpy_avx_128(dd - 192, ss - 192); [[fallthrough]];
+    case 64: memcpy_avx_64(dd - 64, ss - 64); break;
+    case 193: memcpy_avx_128(dd - 193, ss - 193); [[fallthrough]];
+    case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break;
+    case 194: memcpy_avx_128(dd - 194, ss - 194); [[fallthrough]];
+    case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
+    case 195: memcpy_avx_128(dd - 195, ss - 195); [[fallthrough]];
+    case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 196: memcpy_avx_128(dd - 196, ss - 196); [[fallthrough]];
+    case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
+    case 197: memcpy_avx_128(dd - 197, ss - 197); [[fallthrough]];
+    case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 198: memcpy_avx_128(dd - 198, ss - 198); [[fallthrough]];
+    case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 199: memcpy_avx_128(dd - 199, ss - 199); [[fallthrough]];
+    case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 200: memcpy_avx_128(dd - 200, ss - 200); [[fallthrough]];
+    case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
+    case 201: memcpy_avx_128(dd - 201, ss - 201); [[fallthrough]];
+    case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 202: memcpy_avx_128(dd - 202, ss - 202); [[fallthrough]];
+    case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 203: memcpy_avx_128(dd - 203, ss - 203); [[fallthrough]];
+    case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 204: memcpy_avx_128(dd - 204, ss - 204); [[fallthrough]];
+    case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 205: memcpy_avx_128(dd - 205, ss - 205); [[fallthrough]];
+    case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 206: memcpy_avx_128(dd - 206, ss - 206); [[fallthrough]];
+    case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 207: memcpy_avx_128(dd - 207, ss - 207); [[fallthrough]];
+    case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 208: memcpy_avx_128(dd - 208, ss - 208); [[fallthrough]];
+    case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break;
+    case 209: memcpy_avx_128(dd - 209, ss - 209); [[fallthrough]];
+    case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 210: memcpy_avx_128(dd - 210, ss - 210); [[fallthrough]];
+    case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 211: memcpy_avx_128(dd - 211, ss - 211); [[fallthrough]];
+    case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 212: memcpy_avx_128(dd - 212, ss - 212); [[fallthrough]];
+    case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 213: memcpy_avx_128(dd - 213, ss - 213); [[fallthrough]];
+    case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 214: memcpy_avx_128(dd - 214, ss - 214); [[fallthrough]];
+    case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 215: memcpy_avx_128(dd - 215, ss - 215); [[fallthrough]];
+    case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 216: memcpy_avx_128(dd - 216, ss - 216); [[fallthrough]];
+    case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 217: memcpy_avx_128(dd - 217, ss - 217); [[fallthrough]];
+    case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 218: memcpy_avx_128(dd - 218, ss - 218); [[fallthrough]];
+    case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 219: memcpy_avx_128(dd - 219, ss - 219); [[fallthrough]];
+    case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 220: memcpy_avx_128(dd - 220, ss - 220); [[fallthrough]];
+    case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 221: memcpy_avx_128(dd - 221, ss - 221); [[fallthrough]];
+    case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 222: memcpy_avx_128(dd - 222, ss - 222); [[fallthrough]];
+    case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 223: memcpy_avx_128(dd - 223, ss - 223); [[fallthrough]];
+    case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 224: memcpy_avx_128(dd - 224, ss - 224); [[fallthrough]];
+    case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break;
+    case 225: memcpy_avx_128(dd - 225, ss - 225); [[fallthrough]];
+    case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 226: memcpy_avx_128(dd - 226, ss - 226); [[fallthrough]];
+    case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 227: memcpy_avx_128(dd - 227, ss - 227); [[fallthrough]];
+    case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 228: memcpy_avx_128(dd - 228, ss - 228); [[fallthrough]];
+    case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 229: memcpy_avx_128(dd - 229, ss - 229); [[fallthrough]];
+    case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 230: memcpy_avx_128(dd - 230, ss - 230); [[fallthrough]];
+    case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 231: memcpy_avx_128(dd - 231, ss - 231); [[fallthrough]];
+    case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 232: memcpy_avx_128(dd - 232, ss - 232); [[fallthrough]];
+    case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 233: memcpy_avx_128(dd - 233, ss - 233); [[fallthrough]];
+    case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 234: memcpy_avx_128(dd - 234, ss - 234); [[fallthrough]];
+    case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 235: memcpy_avx_128(dd - 235, ss - 235); [[fallthrough]];
+    case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 236: memcpy_avx_128(dd - 236, ss - 236); [[fallthrough]];
+    case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 237: memcpy_avx_128(dd - 237, ss - 237); [[fallthrough]];
+    case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 238: memcpy_avx_128(dd - 238, ss - 238); [[fallthrough]];
+    case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 239: memcpy_avx_128(dd - 239, ss - 239); [[fallthrough]];
+    case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 240: memcpy_avx_128(dd - 240, ss - 240); [[fallthrough]];
+    case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 241: memcpy_avx_128(dd - 241, ss - 241); [[fallthrough]];
+    case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 242: memcpy_avx_128(dd - 242, ss - 242); [[fallthrough]];
+    case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 243: memcpy_avx_128(dd - 243, ss - 243); [[fallthrough]];
+    case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 244: memcpy_avx_128(dd - 244, ss - 244); [[fallthrough]];
+    case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 245: memcpy_avx_128(dd - 245, ss - 245); [[fallthrough]];
+    case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 246: memcpy_avx_128(dd - 246, ss - 246); [[fallthrough]];
+    case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 247: memcpy_avx_128(dd - 247, ss - 247); [[fallthrough]];
+    case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 248: memcpy_avx_128(dd - 248, ss - 248); [[fallthrough]];
+    case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 249: memcpy_avx_128(dd - 249, ss - 249); [[fallthrough]];
+    case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 250: memcpy_avx_128(dd - 250, ss - 250); [[fallthrough]];
+    case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 251: memcpy_avx_128(dd - 251, ss - 251); [[fallthrough]];
+    case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 252: memcpy_avx_128(dd - 252, ss - 252); [[fallthrough]];
+    case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 253: memcpy_avx_128(dd - 253, ss - 253); [[fallthrough]];
+    case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 254: memcpy_avx_128(dd - 254, ss - 254); [[fallthrough]];
+    case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 255: memcpy_avx_128(dd - 255, ss - 255); [[fallthrough]];
+    case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break;
+    case 256: memcpy_avx_256(dd - 256, ss - 256); break;
+    }
+
+    return dst;
+}
+
+
+//---------------------------------------------------------------------
+// main routine
+//---------------------------------------------------------------------
+void* memcpy_fast_avx(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = reinterpret_cast<unsigned char*>(destination);
+    const unsigned char *src = reinterpret_cast<const unsigned char*>(source);
+    static size_t cachesize = 0x200000; // L3-cache size
+    size_t padding;
+
+    // small memory copy
+    if (size <= 256)
+    {
+        memcpy_tiny_avx(dst, src, size);
+        _mm256_zeroupper();
+        return destination;
+    }
+
+    // align destination to 16 bytes boundary
+    padding = (32 - (((size_t)dst) & 31)) & 31;
+
+#if 0
+    if (padding > 0)
+    {
+        __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+        _mm256_storeu_si256((__m256i*)dst, head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+#else
+    __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+    _mm256_storeu_si256((__m256i*)dst, head);
+    dst += padding;
+    src += padding;
+    size -= padding;
+#endif
+
+    // medium size copy
+    if (size <= cachesize)
+    {
+        __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+        for (; size >= 256; size -= 256)
+        {
+            c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+            c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+            c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+            c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+            c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
+            c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
+            c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
+            c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
+            src += 256;
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
+            _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
+            dst += 256;
+        }
+    }
+    else
+    {        // big memory copy
+        __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+        /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */
+
+        if ((((size_t)src) & 31) == 0)
+        {    // source aligned
+            for (; size >= 256; size -= 256)
+            {
+                c0 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+                c1 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+                c2 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+                c3 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+                c4 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 4);
+                c5 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 5);
+                c6 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 6);
+                c7 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 7);
+                src += 256;
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
+                dst += 256;
+            }
+        }
+        else
+        {                            // source unaligned
+            for (; size >= 256; size -= 256)
+            {
+                c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+                c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+                c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+                c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+                c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
+                c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
+                c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
+                c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
+                src += 256;
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
+                _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
+                dst += 256;
+            }
+        }
+        _mm_sfence();
+    }
+
+    memcpy_tiny_avx(dst, src, size);
+    _mm256_zeroupper();
+
+    return destination;
+}
--- a/utils/memcpy-bench/memcpy-bench.cpp
+++ b/utils/memcpy-bench/memcpy-bench.cpp
@ -0,0 +1,620 @@
+#include <memory>
+#include <cstddef>
+#include <string>
+#include <random>
+#include <iostream>
+#include <iomanip>
+#include <thread>
+
+#include <dlfcn.h>
+
+#include <pcg_random.hpp>
+
+#include <common/defines.h>
+
+#include <Common/Stopwatch.h>
+
+#pragma GCC diagnostic ignored "-Wold-style-cast"
+#pragma GCC diagnostic ignored "-Wcast-align"
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#include "FastMemcpy.h"
+//#include "FastMemcpy_Avx.h"
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+
+template <typename F, typename MemcpyImpl>
+void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
+{
+    while (size)
+    {
+        size_t bytes_to_copy = std::min<size_t>(size, chunk_size_distribution());
+
+        impl(dst, src, bytes_to_copy);
+
+        dst += bytes_to_copy;
+        src += bytes_to_copy;
+        size -= bytes_to_copy;
+    }
+}
+
+
+using RNG = pcg32_fast;
+
+template <size_t N>
+size_t generatorUniform(RNG & rng) { return rng() % N; };
+
+
+template <typename F, typename MemcpyImpl>
+void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
+{
+    Stopwatch watch;
+
+    std::vector<std::thread> threads;
+    threads.reserve(num_threads);
+
+    for (size_t thread_num = 0; thread_num < num_threads; ++thread_num)
+    {
+        size_t begin = size * thread_num / num_threads;
+        size_t end = size * (thread_num + 1) / num_threads;
+
+        threads.emplace_back([begin, end, iterations, &src, &dst, &generator, &impl]
+        {
+            for (size_t iteration = 0; iteration < iterations; ++iteration)
+            {
+                loop(
+                    iteration % 2 ? &src[begin] : &dst[begin],
+                    iteration % 2 ? &dst[begin] : &src[begin],
+                    end - begin,
+                    [rng = RNG(), &generator]() mutable { return generator(rng); },
+                    std::forward<MemcpyImpl>(impl));
+            }
+        });
+    }
+
+    for (auto & thread : threads)
+        thread.join();
+
+    double elapsed_ns = watch.elapsed();
+
+    /// Validation
+    size_t sum = 0;
+    for (size_t i = 0; i < size; ++i)
+        sum += dst[i];
+
+    std::cerr << std::fixed << std::setprecision(3)
+        << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
+}
+
+
+using memcpy_type = void * (*)(const void * __restrict, void * __restrict, size_t);
+
+
+static void * memcpy_erms(void * dst, const void * src, size_t size)
+{
+    asm volatile (
+        "rep movsb"
+        : "=D"(dst), "=S"(src), "=c"(size)
+        : "0"(dst), "1"(src), "2"(size)
+        : "memory");
+    return dst;
+}
+
+extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
+extern "C" void MemCpy(void * dst, const void * src, size_t size);
+
+
+static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
+    const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
+    size_t padding;
+
+    // small memory copy
+    if (size <= 16)
+        return memcpy_tiny(dst, src, size);
+
+    // align destination to 16 bytes boundary
+    padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    // medium size copy
+    __m128i c0;
+
+    for (; size >= 16; size -= 16)
+    {
+        c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        src += 16;
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst)), c0);
+        dst += 16;
+    }
+
+    memcpy_tiny(dst, src, size);
+    return destination;
+}
+
+static void * memcpySSE2Unrolled2(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
+    const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
+    size_t padding;
+
+    // small memory copy
+    if (size <= 32)
+        return memcpy_tiny(dst, src, size);
+
+    // align destination to 16 bytes boundary
+    padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    // medium size copy
+    __m128i c0, c1;
+
+    for (; size >= 32; size -= 32)
+    {
+        c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+        c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+        src += 32;
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+        dst += 32;
+    }
+
+    memcpy_tiny(dst, src, size);
+    return destination;
+}
+
+static void * memcpySSE2Unrolled4(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
+    const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
+    size_t padding;
+
+    // small memory copy
+    if (size <= 64)
+        return memcpy_tiny(dst, src, size);
+
+    // align destination to 16 bytes boundary
+    padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    // medium size copy
+    __m128i c0, c1, c2, c3;
+
+    for (; size >= 64; size -= 64)
+    {
+        c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+        c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+        c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+        c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+        src += 64;
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+        dst += 64;
+    }
+
+    memcpy_tiny(dst, src, size);
+    return destination;
+}
+
+
+static void * memcpySSE2Unrolled8(void * __restrict destination, const void * __restrict source, size_t size)
+{
+    unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
+    const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
+    size_t padding;
+
+    // small memory copy
+    if (size <= 128)
+        return memcpy_tiny(dst, src, size);
+
+    // align destination to 16 bytes boundary
+    padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    // medium size copy
+    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+    for (; size >= 128; size -= 128)
+    {
+        c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+        c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+        c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+        c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+        c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+        c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+        c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+        c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+        src += 128;
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+        dst += 128;
+    }
+
+    memcpy_tiny(dst, src, size);
+    return destination;
+}
+
+
+//static __attribute__((__always_inline__, __target__("sse2")))
+__attribute__((__always_inline__))
+void memcpy_my_medium_sse(uint8_t * __restrict & dst, const uint8_t * __restrict & src, size_t & size)
+{
+    /// Align destination to 16 bytes boundary.
+    size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+    if (padding > 0)
+    {
+        __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    /// Aligned unrolled copy.
+    __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+    while (size >= 128)
+    {
+        c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+        c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+        c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+        c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+        c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+        c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+        c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+        c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+        src += 128;
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+        _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+        dst += 128;
+
+        size -= 128;
+    }
+}
+
+__attribute__((__target__("avx")))
+void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * __restrict & __restrict src, size_t & __restrict size)
+{
+    size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
+
+    if (padding > 0)
+    {
+        __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
+        _mm256_storeu_si256((__m256i*)dst, head);
+        dst += padding;
+        src += padding;
+        size -= padding;
+    }
+
+    __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+    while (size >= 256)
+    {
+        c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
+        c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
+        c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
+        c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
+        c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
+        c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
+        c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
+        c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
+        src += 256;
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
+        _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
+        dst += 256;
+
+        size -= 256;
+    }
+}
+
+bool have_avx = true;
+
+static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
+{
+    uint8_t * ret = dst;
+
+tail:
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            *dst = *src;
+        }
+    }
+    else if (have_avx)
+    {
+        if (size <= 32)
+        {
+            __builtin_memcpy(dst, src, 8);
+            __builtin_memcpy(dst + 8, src + 8, 8);
+
+            dst += 16;
+            src += 16;
+            size -= 16;
+
+            goto tail;
+        }
+
+        if (size <= 256)
+        {
+            __asm__(
+                "vmovups    -0x20(%[s],%[size],1), %%ymm0\n"
+                "vmovups    %%ymm0, -0x20(%[d],%[size],1)\n"
+                : [d]"+r"(dst), [s]"+r"(src)
+                : [size]"r"(size)
+                : "ymm0", "memory");
+
+            while (size > 32)
+            {
+                __asm__(
+                    "vmovups    (%[s]), %%ymm0\n"
+                    "vmovups    %%ymm0, (%[d])\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "memory");
+
+                dst += 32;
+                src += 32;
+                size -= 32;
+            }
+        }
+        else
+        {
+            size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
+
+            if (padding > 0)
+            {
+                __asm__(
+                    "vmovups    (%[s]), %%ymm0\n"
+                    "vmovups    %%ymm0, (%[d])\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "memory");
+
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            while (size >= 256)
+            {
+                __asm__(
+                    "vmovups    (%[s]), %%ymm0\n"
+                    "vmovups    0x20(%[s]), %%ymm1\n"
+                    "vmovups    0x40(%[s]), %%ymm2\n"
+                    "vmovups    0x60(%[s]), %%ymm3\n"
+                    "vmovups    0x80(%[s]), %%ymm4\n"
+                    "vmovups    0xa0(%[s]), %%ymm5\n"
+                    "vmovups    0xc0(%[s]), %%ymm6\n"
+                    "vmovups    0xe0(%[s]), %%ymm7\n"
+                    "add        $0x100,%[s]\n"
+                    "vmovaps    %%ymm0, (%[d])\n"
+                    "vmovaps    %%ymm1, 0x20(%[d])\n"
+                    "vmovaps    %%ymm2, 0x40(%[d])\n"
+                    "vmovaps    %%ymm3, 0x60(%[d])\n"
+                    "vmovaps    %%ymm4, 0x80(%[d])\n"
+                    "vmovaps    %%ymm5, 0xa0(%[d])\n"
+                    "vmovaps    %%ymm6, 0xc0(%[d])\n"
+                    "vmovaps    %%ymm7, 0xe0(%[d])\n"
+                    "add        $0x100, %[d]\n"
+                    : [d]"+r"(dst), [s]"+r"(src)
+                    :
+                    : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory");
+
+                size -= 256;
+            }
+
+            goto tail;
+        }
+    }
+    else
+    {
+        if (size <= 128)
+        {
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            goto tail;
+        }
+    }
+
+    return ret;
+}
+
+
+template <typename F>
+void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
+{
+    memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
+
+    if (memcpy_variant == 1)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
+    if (memcpy_variant == 2)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
+    if (memcpy_variant == 3)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
+    if (memcpy_variant == 4)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
+    if (memcpy_variant == 5)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
+    if (memcpy_variant == 6)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
+    if (memcpy_variant == 7)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
+    if (memcpy_variant == 8)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
+//    if (memcpy_variant == 9)
+//        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
+    if (memcpy_variant == 10)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
+}
+
+void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
+{
+    if (generator_variant == 1)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
+    if (generator_variant == 2)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
+    if (generator_variant == 3)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
+    if (generator_variant == 4)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
+    if (generator_variant == 5)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
+}
+
+
+int main(int argc, char ** argv)
+{
+    size_t size = 1000000000;
+    if (argc >= 2)
+        size = std::stoull(argv[1]);
+
+    size_t iterations = 10;
+    if (argc >= 3)
+        iterations = std::stoull(argv[2]);
+
+    size_t num_threads = 1;
+    if (argc >= 4)
+        num_threads = std::stoull(argv[3]);
+
+    size_t memcpy_variant = 1;
+    if (argc >= 5)
+        memcpy_variant = std::stoull(argv[4]);
+
+    size_t generator_variant = 1;
+    if (argc >= 6)
+        generator_variant = std::stoull(argv[5]);
+
+    std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
+    std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
+
+    /// Fill src with some pattern for validation.
+    for (size_t i = 0; i < size; ++i)
+        src[i] = i;
+
+    /// Fill dst to avoid page faults.
+    memset(dst.get(), 0, size);
+
+    dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
+
+    return 0;
+}
--- a/utils/memcpy-bench/memcpy_jart.S
+++ b/utils/memcpy-bench/memcpy_jart.S
@ -0,0 +1,138 @@
+/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
+│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+
+//	Copies memory.
+//
+//	DEST and SRC must not overlap, unless DEST≤SRC.
+//
+//	@param	rdi is dest
+//	@param	rsi is src
+//	@param	rdx is number of bytes
+//	@return	original rdi copied to rax
+//	@mode	long
+//	@asyncsignalsafe
+memcpy_jart:	mov	%rdi,%rax
+//	𝑠𝑙𝑖𝑑𝑒
+	.align	16
+	.type	memcpy_jart,@function
+	.size	memcpy_jart,.-memcpy_jart
+	.globl	memcpy_jart
+
+//	Copies memory w/ minimal impact ABI.
+//
+//	@param	rdi is dest
+//	@param	rsi is src
+//	@param	rdx is number of bytes
+//	@clob	flags,rcx,xmm3,xmm4
+//	@mode	long
+MemCpy:	mov	$.Lmemcpytab.size,%ecx
+	cmp	%rcx,%rdx
+	cmovb	%rdx,%rcx
+	jmp	*memcpytab(,%rcx,8)
+.Lanchorpoint:
+.L16r:	cmp	$1024,%rdx
+	jae	.Lerms
+.L16:	movdqu	-16(%rsi,%rdx),%xmm4
+	mov	$16,%rcx
+0:	add	$16,%rcx
+	movdqu	-32(%rsi,%rcx),%xmm3
+	movdqu	%xmm3,-32(%rdi,%rcx)
+	cmp	%rcx,%rdx
+	ja	0b
+	movdqu	%xmm4,-16(%rdi,%rdx)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm3,%xmm3
+	jmp	.L0
+.L8:	push	%rbx
+	mov	(%rsi),%rcx
+	mov	-8(%rsi,%rdx),%rbx
+	mov	%rcx,(%rdi)
+	mov	%rbx,-8(%rdi,%rdx)
+1:	pop	%rbx
+.L0:	ret
+.L4:	push	%rbx
+	mov	(%rsi),%ecx
+	mov	-4(%rsi,%rdx),%ebx
+	mov	%ecx,(%rdi)
+	mov	%ebx,-4(%rdi,%rdx)
+	jmp	1b
+.L3:	push	%rbx
+	mov	(%rsi),%cx
+	mov	-2(%rsi,%rdx),%bx
+	mov	%cx,(%rdi)
+	mov	%bx,-2(%rdi,%rdx)
+	jmp	1b
+.L2:	mov	(%rsi),%cx
+	mov	%cx,(%rdi)
+	jmp	.L0
+.L1:	mov	(%rsi),%cl
+	mov	%cl,(%rdi)
+	jmp	.L0
+.Lerms:	cmp	$1024*1024,%rdx
+	ja	.Lnts
+	push	%rdi
+	push	%rsi
+	mov	%rdx,%rcx
+	rep movsb
+	pop	%rsi
+	pop	%rdi
+	jmp	.L0
+.Lnts:	movdqu	(%rsi),%xmm3
+	movdqu	%xmm3,(%rdi)
+	lea	16(%rdi),%rcx
+	and	$-16,%rcx
+	sub	%rdi,%rcx
+	add	%rcx,%rdi
+	add	%rcx,%rsi
+	sub	%rcx,%rdx
+	mov	$16,%rcx
+0:	add	$16,%rcx
+	movdqu	-32(%rsi,%rcx),%xmm3
+	movntdq	%xmm3,-32(%rdi,%rcx)
+	cmp	%rcx,%rdx
+	ja	0b
+	sfence
+	movdqu	-16(%rsi,%rdx),%xmm3
+	movdqu	%xmm3,-16(%rdi,%rdx)
+	pxor	%xmm3,%xmm3
+	jmp	.L0
+	.type	MemCpy,@function
+	.size	MemCpy,.-MemCpy
+	.globl	MemCpy
+
+	.section .rodata
+	.align	8
+memcpytab:
+	.quad	.L0
+	.quad	.L1
+	.quad	.L2
+	.quad	.L3
+	.rept	4
+	.quad	.L4
+	.endr
+	.rept	8
+	.quad	.L8
+	.endr
+	.rept	16
+	.quad	.L16
+	.endr
+	.equ	.Lmemcpytab.size,(.-memcpytab)/8
+	.quad	.L16r # SSE + ERMS + NTS
+	.type	memcpytab,@object
+	.previous