Merge pull request #21520 from ClickHouse/replace-memcpy

Evaluate yet another memcpy
alexey-milovidov 2021-03-14 12:24:48 +03:00 committed by GitHub
commit 2ea38ea01e
22 changed files with 2270 additions and 1668 deletions

View File

@@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON
if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
# Only for Linux, x86_64.
# Implies ${ENABLE_FASTMEMCPY}
option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
elseif(GLIBC_COMPATIBILITY)
message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration")
@@ -536,7 +535,7 @@ macro (add_executable target)
# explicitly acquire and interpose malloc symbols by clickhouse_malloc
# if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on, then provide the memcpy symbol explicitly to neutralize thinlto's libcall generation.
if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:clickhouse_memcpy>)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:memcpy>)
else ()
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc>)
endif ()

View File

@@ -74,7 +74,6 @@ target_link_libraries (common
${CITYHASH_LIBRARIES}
boost::headers_only
boost::system
FastMemcpy
Poco::Net
Poco::Net::SSL
Poco::Util

View File

@@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS})
target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (local_date_time_comparison PRIVATE common)
target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (realloc-perf PRIVATE common)
add_check(local_date_time_comparison)

View File

@@ -1,5 +1,8 @@
if (GLIBC_COMPATIBILITY)
set (ENABLE_FASTMEMCPY ON)
add_subdirectory(memcpy)
if(TARGET memcpy)
set(MEMCPY_LIBRARY memcpy)
endif()
enable_language(ASM)
include(CheckIncludeFile)
@@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY)
list(APPEND glibc_compatibility_sources musl/getentropy.c)
endif()
if (NOT ARCH_ARM)
# clickhouse_memcpy doesn't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951
add_library (clickhouse_memcpy OBJECT
${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c
)
endif()
# Need to omit frame pointers to match the performance of glibc
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")
@@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY)
target_compile_options(glibc-compatibility PRIVATE -fPIC)
endif ()
target_link_libraries(global-libs INTERFACE glibc-compatibility)
target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY})
install(
TARGETS glibc-compatibility
TARGETS glibc-compatibility ${MEMCPY_LIBRARY}
EXPORT global
ARCHIVE DESTINATION lib
)
message (STATUS "Some symbols from glibc will be replaced for compatibility")
elseif (YANDEX_OFFICIAL_BUILD)
message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.")
endif ()

View File

@@ -0,0 +1,8 @@
if (ARCH_AMD64)
add_library(memcpy STATIC memcpy.cpp)
# We allow including memcpy.h from user code for better inlining.
target_include_directories(memcpy PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
target_compile_options(memcpy PRIVATE -fno-builtin-memcpy)
endif ()
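The -fno-builtin-memcpy option above guards against the hazard described in memcpy.h below: the compiler may pattern-match a plain copy loop back into a call to memcpy. A minimal sketch of the kind of loop that can be transformed this way (the function name is illustrative, not from the patch):

#include <cstddef>

/// With -ftree-loop-distribute-patterns (and without -fno-builtin-memcpy),
/// the compiler may replace this loop with a call to memcpy - which is fatal
/// if such a loop is the body of memcpy itself (infinite recursion).
void * naive_memcpy(void * dst_, const void * src_, size_t size)
{
    char * dst = static_cast<char *>(dst_);
    const char * src = static_cast<const char *>(src_);
    for (size_t i = 0; i < size; ++i)
        dst[i] = src[i];
    return dst_;
}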

View File

@@ -0,0 +1,6 @@
#include "memcpy.h"
extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
{
return inline_memcpy(dst, src, size);
}
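A minimal usage sketch of the direct-call path mentioned in memcpy.h (calling inline_memcpy instead of memcpy so that the copy can be inlined at the call site); the function and buffer names are illustrative, not part of the patch:

#include <cstddef>
#include "memcpy.h"

void copy_row(char * dst, const char * src, size_t n)
{
    /// A direct call lets the compiler inline the branchy small-size path
    /// and drop the rarely used "return dst" value.
    inline_memcpy(dst, src, n);
}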

View File

@@ -0,0 +1,217 @@
#include <cstddef>
#include <emmintrin.h>
/** Custom memcpy implementation for ClickHouse.
* It has the following benefits over using glibc's implementation:
* 1. Avoiding a dependency on a specific version of glibc's symbol, like memcpy@@GLIBC_2.14, for portability.
* 2. Avoiding an indirect call via the PLT due to shared linking, which can be less efficient.
* 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
* 4. Better results on our performance tests on current CPUs: up to 25% on some queries and 0.7%..1% on average across all queries.
*
* Writing our own memcpy is extremely difficult for the following reasons:
* 1. The optimal variant depends on the specific CPU model.
* 2. The optimal variant depends on the distribution of size arguments.
* 3. It depends on the number of threads copying data concurrently.
* 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other.
* This vast range of scenarios makes proper testing especially difficult.
* When writing our own memcpy there is a risk of overoptimizing it
* for non-representative microbenchmarks while actually making real-world use cases worse.
*
* Most of the benchmarks for memcpy on the internet are wrong.
*
* Let's look at the details:
*
* For small sizes, the order of branches in code is important.
* There are variants with a specific order of branches (like here or in glibc)
* or with a jump table (in asm code; see the example from Cosmopolitan libc:
* https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61)
* or with Duff's device in C (see https://github.com/skywind3000/FastMemcpy/)
*
* It's also important how uneven sizes are copied.
* Almost every implementation, including this one, uses two overlapping movs.
*
* It is important to disable -ftree-loop-distribute-patterns when compiling a memcpy implementation,
* otherwise the compiler can replace internal loops with a call to memcpy, leading to infinite recursion.
*
* For larger sizes it's important to choose which instructions are used:
* - SSE or AVX or AVX-512;
* - rep movsb;
* Performance will depend on the size threshold, on the CPU model, and on the "erms" flag
* ("Enhanced REP MOVSB" - it indicates that the performance of "rep movsb" is decent for large sizes)
* https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
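*
* As a hedged illustration (not used by the implementation below), the "rep movsb" variant
* can be expressed with GCC/Clang inline asm on x86_64; the name memcpy_erms is ours, not from this patch:
*
* static inline void * memcpy_erms(void * dst, const void * src, size_t size)
* {
*     void * ret = dst;
*     // "rep movsb" takes the destination in RDI, the source in RSI and the count in RCX.
*     __asm__ volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
*     return ret;
* }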
*
* Using AVX-512 can be bad due to frequency throttling.
* Using AVX can be bad if most code is using SSE, due to the switching penalty
* (this also depends on the usage of the "vzeroupper" instruction).
* But in some cases AVX gives a win.
*
* It also depends on how many times the loop is unrolled.
* We are unrolling the loop 8 times (by the number of available registers), but this is not always the best.
*
* It also depends on the usage of aligned or unaligned loads/stores.
* We are using unaligned loads and aligned stores.
*
* It also depends on the usage of prefetch instructions. They make sense on some Intel CPUs but can slow down performance on AMD.
* Choosing the correct offset for prefetching is non-obvious.
*
* Non-temporal (cache-bypassing) stores can be used for very large sizes (more than half of the L3 cache).
* But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
* because the L3 cache is shared (and the L2 cache is partially shared).
*
* Very large memcpy sizes typically indicate suboptimal (not cache-friendly) algorithms or unrealistic scenarios,
* so we don't pay attention to using non-temporal stores.
*
* On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial variant,
* even compared to non-temporal aligned unrolled stores with the widest registers.
*
* memcpy can be written in asm, C or C++. The latter can also use inline asm.
* An asm implementation can be better for making sure that the compiler won't make the code worse:
* it ensures the order of branches, the code layout, and the usage of all required registers.
* But if it is located in a separate translation unit, inlining will not be possible
* (inline asm can be used to overcome this limitation).
* Sometimes C or C++ code can be further optimized by the compiler.
* For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
*
* Please note that the compiler can replace plain code with memcpy and vice versa:
* - memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy;
* it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
* This is often used to implement unaligned loads/stores without undefined behaviour in C++.
* - a loop copying bytes can be recognized and replaced with a call to memcpy;
* it is controlled by -ftree-loop-distribute-patterns.
* - also note that a loop copying bytes can be unrolled, peeled and vectorized, which will give you
* inline code somewhat similar to a decent implementation of memcpy.
*
* This description is up to date as of Mar 2021.
*
* How to test the memcpy implementation for performance:
* 1. Test on a real production workload.
* 2. For synthetic tests, see utils/memcpy-bench, but make sure you do your best to cover the wide range of scenarios.
*
* TODO: Add a self-tuning memcpy with a Bayesian bandits algorithm for large sizes.
* See https://habr.com/en/company/yandex/blog/457612/
*/
static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
/// We will use pointer arithmetic, so a char pointer will be used.
/// Note that __restrict makes sense (otherwise the compiler will reload data from memory
/// instead of using the values in registers due to possible aliasing).
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
/// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
/// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
/// for inlining and removing this single instruction.
void * ret = dst;
tail:
/// Small sizes and tails after the loop for large sizes.
/// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
/// This order of branches is from the disassembly of glibc's code.
/// We copy chunks of possibly uneven size with two overlapping movs.
/// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
if (size <= 16)
{
if (size >= 8)
{
/// Chunks of 8..16 bytes.
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
}
else if (size >= 4)
{
/// Chunks of 4..7 bytes.
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
}
else if (size >= 2)
{
/// Chunks of 2..3 bytes.
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
}
else if (size >= 1)
{
/// A single byte.
*dst = *src;
}
/// No bytes remaining.
}
else
{
/// Medium and large sizes.
if (size <= 128)
{
/// Medium size, not enough for full loop unrolling.
/// We will copy the last 16 bytes.
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
/// Then we will copy every 16 bytes from the beginning in a loop.
/// The last loop iteration will possibly overwrite some part of the already copied last 16 bytes.
/// This is Ok, similar to the code for small sizes above.
while (size > 16)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
dst += 16;
src += 16;
size -= 16;
}
}
else
{
/// Large size with fully unrolled loop.
/// Align the destination to a 16-byte boundary.
size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
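/// Example: if dst & 15 == 5, then padding = (16 - 5) & 15 = 11, so dst + 11 is 16-byte aligned;
/// if dst is already aligned, padding = (16 - 0) & 15 = 0 and no head copy is needed.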
/// If not aligned, we will copy the first 16 bytes with unaligned stores.
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
/// Aligned unrolled copy. We will use all available SSE registers.
/// In general, it's not possible to have both src and dst aligned,
/// so we will use aligned stores and unaligned loads.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
while (size >= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;
size -= 128;
}
/// The last remaining 0..127 bytes will be processed as usual.
goto tail;
}
}
return ret;
}
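As a side note to the header comment above, the __builtin_memcpy idiom for unaligned loads looks like the following minimal sketch (the function name is illustrative, not part of the patch):

#include <cstdint>

/// Reads 8 bytes from a possibly misaligned address without undefined behaviour.
/// Since the size is a compile-time constant, the compiler emits a single load
/// instead of a call to memcpy.
inline uint64_t unaligned_load64(const void * address)
{
    uint64_t res;
    __builtin_memcpy(&res, address, sizeof(res));
    return res;
}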

View File

@@ -38,7 +38,6 @@ add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)
add_subdirectory (consistent-hashing)
add_subdirectory (dragonbox-cmake)
add_subdirectory (FastMemcpy)
add_subdirectory (hyperscan-cmake)
add_subdirectory (jemalloc-cmake)
add_subdirectory (libcpuid-cmake)

View File

@@ -1,28 +0,0 @@
option (ENABLE_FASTMEMCPY "Enable FastMemcpy library (only internal)" ${ENABLE_LIBRARIES})
if (NOT OS_LINUX OR ARCH_AARCH64)
set (ENABLE_FASTMEMCPY OFF)
endif ()
if (ENABLE_FASTMEMCPY)
set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy)
set (SRCS
${LIBRARY_DIR}/FastMemcpy.c
memcpy_wrapper.c
)
add_library (FastMemcpy ${SRCS})
target_include_directories (FastMemcpy PUBLIC ${LIBRARY_DIR})
target_compile_definitions(FastMemcpy PUBLIC USE_FASTMEMCPY=1)
message (STATUS "Using FastMemcpy")
else ()
add_library (FastMemcpy INTERFACE)
target_compile_definitions(FastMemcpy INTERFACE USE_FASTMEMCPY=0)
message (STATUS "Not using FastMemcpy")
endif ()

View File

@@ -1,220 +0,0 @@
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
//
//=====================================================================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#if (defined(_WIN32) || defined(WIN32))
#include <windows.h>
#include <mmsystem.h>
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
#elif defined(__unix)
#include <sys/time.h>
#include <unistd.h>
#else
#error it can only be compiled under windows or unix
#endif
#include "FastMemcpy.h"
unsigned int gettime()
{
#if (defined(_WIN32) || defined(WIN32))
return timeGetTime();
#else
static struct timezone tz={ 0,0 };
struct timeval time;
gettimeofday(&time,&tz);
return (time.tv_sec * 1000 + time.tv_usec / 1000);
#endif
}
void sleepms(unsigned int millisec)
{
#if defined(_WIN32) || defined(WIN32)
Sleep(millisec);
#else
usleep(millisec * 1000);
#endif
}
void benchmark(int dstalign, int srcalign, size_t size, int times)
{
char *DATA1 = (char*)malloc(size + 64);
char *DATA2 = (char*)malloc(size + 64);
size_t LINEAR1 = ((size_t)DATA1);
size_t LINEAR2 = ((size_t)DATA2);
char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
unsigned int t1, t2;
int k;
sleepms(100);
t1 = gettime();
for (k = times; k > 0; k--) {
memcpy(dst, src, size);
}
t1 = gettime() - t1;
sleepms(100);
t2 = gettime();
for (k = times; k > 0; k--) {
memcpy_fast(dst, src, size);
}
t2 = gettime() - t2;
free(DATA1);
free(DATA2);
printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",
dstalign? "aligned" : "unalign",
srcalign? "aligned" : "unalign", (int)t2, (int)t1);
}
void bench(int copysize, int times)
{
printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
benchmark(1, 1, copysize, times);
benchmark(1, 0, copysize, times);
benchmark(0, 1, copysize, times);
benchmark(0, 0, copysize, times);
printf("\n");
}
void random_bench(int maxsize, int times)
{
static char A[11 * 1024 * 1024 + 2];
static char B[11 * 1024 * 1024 + 2];
static int random_offsets[0x10000];
static int random_sizes[0x8000];
unsigned int i, p1, p2;
unsigned int t1, t2;
for (i = 0; i < 0x10000; i++) { // generate random offsets
random_offsets[i] = rand() % (10 * 1024 * 1024 + 1);
}
for (i = 0; i < 0x8000; i++) { // generate random sizes
random_sizes[i] = 1 + rand() % maxsize;
}
sleepms(100);
t1 = gettime();
for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
int offset1 = random_offsets[(p1++) & 0xffff];
int offset2 = random_offsets[(p1++) & 0xffff];
int size = random_sizes[(p2++) & 0x7fff];
memcpy(A + offset1, B + offset2, size);
}
t1 = gettime() - t1;
sleepms(100);
t2 = gettime();
for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
int offset1 = random_offsets[(p1++) & 0xffff];
int offset2 = random_offsets[(p1++) & 0xffff];
int size = random_sizes[(p2++) & 0x7fff];
memcpy_fast(A + offset1, B + offset2, size);
}
t2 = gettime() - t2;
printf("benchmark random access:\n");
printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
}
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
int main(void)
{
bench(32, 0x1000000);
bench(64, 0x1000000);
bench(512, 0x800000);
bench(1024, 0x400000);
bench(4096, 0x80000);
bench(8192, 0x40000);
bench(1024 * 1024 * 1, 0x800);
bench(1024 * 1024 * 4, 0x200);
bench(1024 * 1024 * 8, 0x100);
random_bench(2048, 8000000);
return 0;
}
/*
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms
result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms
result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms
result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms
benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms
result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms
result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms
result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms
benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms
result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms
result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms
result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms
benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms
result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms
result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms
result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms
benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms
result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms
result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms
benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms
result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms
result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms
result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms
benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms
result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms
result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms
result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms
benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms
result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms
result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms
result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms
benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms
result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms
result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms
result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms
benchmark random access:
memcpy_fast=515ms memcpy=1014ms
*/

View File

@@ -1,694 +0,0 @@
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
//
//=====================================================================
#ifndef __FAST_MEMCPY_H__
#define __FAST_MEMCPY_H__
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>
//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
#define INLINE __inline__ __attribute__((always_inline))
#else
#define INLINE __inline__
#endif
#elif defined(_MSC_VER)
#define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
#define INLINE __inline
#else
#define INLINE
#endif
#endif
typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------
static INLINE void memcpy_sse2_16(void *dst, const void *src) {
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
}
static INLINE void memcpy_sse2_32(void *dst, const void *src) {
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
_mm_storeu_si128(((__m128i*)dst) + 1, m1);
}
static INLINE void memcpy_sse2_64(void *dst, const void *src) {
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
_mm_storeu_si128(((__m128i*)dst) + 1, m1);
_mm_storeu_si128(((__m128i*)dst) + 2, m2);
_mm_storeu_si128(((__m128i*)dst) + 3, m3);
}
static INLINE void memcpy_sse2_128(void *dst, const void *src) {
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
__m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1);
__m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2);
__m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3);
__m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4);
__m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5);
__m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6);
__m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
_mm_storeu_si128(((__m128i*)dst) + 1, m1);
_mm_storeu_si128(((__m128i*)dst) + 2, m2);
_mm_storeu_si128(((__m128i*)dst) + 3, m3);
_mm_storeu_si128(((__m128i*)dst) + 4, m4);
_mm_storeu_si128(((__m128i*)dst) + 5, m5);
_mm_storeu_si128(((__m128i*)dst) + 6, m6);
_mm_storeu_si128(((__m128i*)dst) + 7, m7);
}
//---------------------------------------------------------------------
// tiny memory copy with jump table optimized
//---------------------------------------------------------------------
/// Attribute is used to avoid an error with undefined behaviour sanitizer
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
unsigned char *dd = ((unsigned char*)dst) + size;
const unsigned char *ss = ((const unsigned char*)src) + size;
switch (size) {
case 64:
memcpy_sse2_64(dd - 64, ss - 64);
case 0:
break;
case 65:
memcpy_sse2_64(dd - 65, ss - 65);
case 1:
dd[-1] = ss[-1];
break;
case 66:
memcpy_sse2_64(dd - 66, ss - 66);
case 2:
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 67:
memcpy_sse2_64(dd - 67, ss - 67);
case 3:
*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 68:
memcpy_sse2_64(dd - 68, ss - 68);
case 4:
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 69:
memcpy_sse2_64(dd - 69, ss - 69);
case 5:
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 70:
memcpy_sse2_64(dd - 70, ss - 70);
case 6:
*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 71:
memcpy_sse2_64(dd - 71, ss - 71);
case 7:
*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 72:
memcpy_sse2_64(dd - 72, ss - 72);
case 8:
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 73:
memcpy_sse2_64(dd - 73, ss - 73);
case 9:
*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 74:
memcpy_sse2_64(dd - 74, ss - 74);
case 10:
*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 75:
memcpy_sse2_64(dd - 75, ss - 75);
case 11:
*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 76:
memcpy_sse2_64(dd - 76, ss - 76);
case 12:
*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 77:
memcpy_sse2_64(dd - 77, ss - 77);
case 13:
*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 78:
memcpy_sse2_64(dd - 78, ss - 78);
case 14:
*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 79:
memcpy_sse2_64(dd - 79, ss - 79);
case 15:
*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 80:
memcpy_sse2_64(dd - 80, ss - 80);
case 16:
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 81:
memcpy_sse2_64(dd - 81, ss - 81);
case 17:
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 82:
memcpy_sse2_64(dd - 82, ss - 82);
case 18:
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 83:
memcpy_sse2_64(dd - 83, ss - 83);
case 19:
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 84:
memcpy_sse2_64(dd - 84, ss - 84);
case 20:
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 85:
memcpy_sse2_64(dd - 85, ss - 85);
case 21:
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 86:
memcpy_sse2_64(dd - 86, ss - 86);
case 22:
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 87:
memcpy_sse2_64(dd - 87, ss - 87);
case 23:
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 88:
memcpy_sse2_64(dd - 88, ss - 88);
case 24:
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 89:
memcpy_sse2_64(dd - 89, ss - 89);
case 25:
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 90:
memcpy_sse2_64(dd - 90, ss - 90);
case 26:
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 91:
memcpy_sse2_64(dd - 91, ss - 91);
case 27:
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 92:
memcpy_sse2_64(dd - 92, ss - 92);
case 28:
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 93:
memcpy_sse2_64(dd - 93, ss - 93);
case 29:
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 94:
memcpy_sse2_64(dd - 94, ss - 94);
case 30:
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 95:
memcpy_sse2_64(dd - 95, ss - 95);
case 31:
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 96:
memcpy_sse2_64(dd - 96, ss - 96);
case 32:
memcpy_sse2_32(dd - 32, ss - 32);
break;
case 97:
memcpy_sse2_64(dd - 97, ss - 97);
case 33:
memcpy_sse2_32(dd - 33, ss - 33);
dd[-1] = ss[-1];
break;
case 98:
memcpy_sse2_64(dd - 98, ss - 98);
case 34:
memcpy_sse2_32(dd - 34, ss - 34);
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 99:
memcpy_sse2_64(dd - 99, ss - 99);
case 35:
memcpy_sse2_32(dd - 35, ss - 35);
*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 100:
memcpy_sse2_64(dd - 100, ss - 100);
case 36:
memcpy_sse2_32(dd - 36, ss - 36);
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 101:
memcpy_sse2_64(dd - 101, ss - 101);
case 37:
memcpy_sse2_32(dd - 37, ss - 37);
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 102:
memcpy_sse2_64(dd - 102, ss - 102);
case 38:
memcpy_sse2_32(dd - 38, ss - 38);
*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 103:
memcpy_sse2_64(dd - 103, ss - 103);
case 39:
memcpy_sse2_32(dd - 39, ss - 39);
*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 104:
memcpy_sse2_64(dd - 104, ss - 104);
case 40:
memcpy_sse2_32(dd - 40, ss - 40);
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 105:
memcpy_sse2_64(dd - 105, ss - 105);
case 41:
memcpy_sse2_32(dd - 41, ss - 41);
*((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 106:
memcpy_sse2_64(dd - 106, ss - 106);
case 42:
memcpy_sse2_32(dd - 42, ss - 42);
*((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 107:
memcpy_sse2_64(dd - 107, ss - 107);
case 43:
memcpy_sse2_32(dd - 43, ss - 43);
*((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 108:
memcpy_sse2_64(dd - 108, ss - 108);
case 44:
memcpy_sse2_32(dd - 44, ss - 44);
*((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 109:
memcpy_sse2_64(dd - 109, ss - 109);
case 45:
memcpy_sse2_32(dd - 45, ss - 45);
*((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 110:
memcpy_sse2_64(dd - 110, ss - 110);
case 46:
memcpy_sse2_32(dd - 46, ss - 46);
*((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 111:
memcpy_sse2_64(dd - 111, ss - 111);
case 47:
memcpy_sse2_32(dd - 47, ss - 47);
*((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8));
break;
case 112:
memcpy_sse2_64(dd - 112, ss - 112);
case 48:
memcpy_sse2_32(dd - 48, ss - 48);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 113:
memcpy_sse2_64(dd - 113, ss - 113);
case 49:
memcpy_sse2_32(dd - 49, ss - 49);
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 114:
memcpy_sse2_64(dd - 114, ss - 114);
case 50:
memcpy_sse2_32(dd - 50, ss - 50);
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 115:
memcpy_sse2_64(dd - 115, ss - 115);
case 51:
memcpy_sse2_32(dd - 51, ss - 51);
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 116:
memcpy_sse2_64(dd - 116, ss - 116);
case 52:
memcpy_sse2_32(dd - 52, ss - 52);
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 117:
memcpy_sse2_64(dd - 117, ss - 117);
case 53:
memcpy_sse2_32(dd - 53, ss - 53);
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 118:
memcpy_sse2_64(dd - 118, ss - 118);
case 54:
memcpy_sse2_32(dd - 54, ss - 54);
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2));
break;
case 119:
memcpy_sse2_64(dd - 119, ss - 119);
case 55:
memcpy_sse2_32(dd - 55, ss - 55);
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4));
break;
case 120:
memcpy_sse2_64(dd - 120, ss - 120);
case 56:
memcpy_sse2_32(dd - 56, ss - 56);
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 121:
memcpy_sse2_64(dd - 121, ss - 121);
case 57:
memcpy_sse2_32(dd - 57, ss - 57);
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 122:
memcpy_sse2_64(dd - 122, ss - 122);
case 58:
memcpy_sse2_32(dd - 58, ss - 58);
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 123:
memcpy_sse2_64(dd - 123, ss - 123);
case 59:
memcpy_sse2_32(dd - 59, ss - 59);
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 124:
memcpy_sse2_64(dd - 124, ss - 124);
case 60:
memcpy_sse2_32(dd - 60, ss - 60);
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 125:
memcpy_sse2_64(dd - 125, ss - 125);
case 61:
memcpy_sse2_32(dd - 61, ss - 61);
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 126:
memcpy_sse2_64(dd - 126, ss - 126);
case 62:
memcpy_sse2_32(dd - 62, ss - 62);
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 127:
memcpy_sse2_64(dd - 127, ss - 127);
case 63:
memcpy_sse2_32(dd - 63, ss - 63);
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 128:
memcpy_sse2_128(dd - 128, ss - 128);
break;
}
return dst;
}
//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
static void* memcpy_fast(void *destination, const void *source, size_t size)
{
unsigned char *dst = (unsigned char*)destination;
const unsigned char *src = (const unsigned char*)source;
static size_t cachesize = 0x200000; // L2-cache size
size_t padding;
// small memory copy
if (size <= 128) {
return memcpy_tiny(dst, src, size);
}
// align destination to 16 bytes boundary
padding = (16 - (((size_t)dst) & 15)) & 15;
if (padding > 0) {
__m128i head = _mm_loadu_si128((const __m128i*)src);
_mm_storeu_si128((__m128i*)dst, head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
if (size <= cachesize) {
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 128; size -= 128) {
c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_store_si128((((__m128i*)dst) + 0), c0);
_mm_store_si128((((__m128i*)dst) + 1), c1);
_mm_store_si128((((__m128i*)dst) + 2), c2);
_mm_store_si128((((__m128i*)dst) + 3), c3);
_mm_store_si128((((__m128i*)dst) + 4), c4);
_mm_store_si128((((__m128i*)dst) + 5), c5);
_mm_store_si128((((__m128i*)dst) + 6), c6);
_mm_store_si128((((__m128i*)dst) + 7), c7);
dst += 128;
}
}
else { // big memory copy
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
if ((((size_t)src) & 15) == 0) { // source aligned
for (; size >= 128; size -= 128) {
c0 = _mm_load_si128(((const __m128i*)src) + 0);
c1 = _mm_load_si128(((const __m128i*)src) + 1);
c2 = _mm_load_si128(((const __m128i*)src) + 2);
c3 = _mm_load_si128(((const __m128i*)src) + 3);
c4 = _mm_load_si128(((const __m128i*)src) + 4);
c5 = _mm_load_si128(((const __m128i*)src) + 5);
c6 = _mm_load_si128(((const __m128i*)src) + 6);
c7 = _mm_load_si128(((const __m128i*)src) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128((((__m128i*)dst) + 0), c0);
_mm_stream_si128((((__m128i*)dst) + 1), c1);
_mm_stream_si128((((__m128i*)dst) + 2), c2);
_mm_stream_si128((((__m128i*)dst) + 3), c3);
_mm_stream_si128((((__m128i*)dst) + 4), c4);
_mm_stream_si128((((__m128i*)dst) + 5), c5);
_mm_stream_si128((((__m128i*)dst) + 6), c6);
_mm_stream_si128((((__m128i*)dst) + 7), c7);
dst += 128;
}
}
else { // source unaligned
for (; size >= 128; size -= 128) {
c0 = _mm_loadu_si128(((const __m128i*)src) + 0);
c1 = _mm_loadu_si128(((const __m128i*)src) + 1);
c2 = _mm_loadu_si128(((const __m128i*)src) + 2);
c3 = _mm_loadu_si128(((const __m128i*)src) + 3);
c4 = _mm_loadu_si128(((const __m128i*)src) + 4);
c5 = _mm_loadu_si128(((const __m128i*)src) + 5);
c6 = _mm_loadu_si128(((const __m128i*)src) + 6);
c7 = _mm_loadu_si128(((const __m128i*)src) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128((((__m128i*)dst) + 0), c0);
_mm_stream_si128((((__m128i*)dst) + 1), c1);
_mm_stream_si128((((__m128i*)dst) + 2), c2);
_mm_stream_si128((((__m128i*)dst) + 3), c3);
_mm_stream_si128((((__m128i*)dst) + 4), c4);
_mm_stream_si128((((__m128i*)dst) + 5), c5);
_mm_stream_si128((((__m128i*)dst) + 6), c6);
_mm_stream_si128((((__m128i*)dst) + 7), c7);
dst += 128;
}
}
_mm_sfence();
}
memcpy_tiny(dst, src, size);
return destination;
}
#endif

View File

@@ -1,171 +0,0 @@
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9)
//
//=====================================================================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#if (defined(_WIN32) || defined(WIN32))
#include <windows.h>
#include <mmsystem.h>
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
#elif defined(__unix)
#include <sys/time.h>
#include <unistd.h>
#else
#error it can only be compiled under windows or unix
#endif
#include "FastMemcpy_Avx.h"
unsigned int gettime()
{
#if (defined(_WIN32) || defined(WIN32))
return timeGetTime();
#else
static struct timezone tz={ 0,0 };
struct timeval time;
gettimeofday(&time,&tz);
return (time.tv_sec * 1000 + time.tv_usec / 1000);
#endif
}
void sleepms(unsigned int millisec)
{
#if defined(_WIN32) || defined(WIN32)
Sleep(millisec);
#else
usleep(millisec * 1000);
#endif
}
void benchmark(int dstalign, int srcalign, size_t size, int times)
{
char *DATA1 = (char*)malloc(size + 64);
char *DATA2 = (char*)malloc(size + 64);
size_t LINEAR1 = ((size_t)DATA1);
size_t LINEAR2 = ((size_t)DATA2);
char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1);
char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2);
char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1);
char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3);
unsigned int t1, t2;
int k;
sleepms(100);
t1 = gettime();
for (k = times; k > 0; k--) {
memcpy(dst, src, size);
}
t1 = gettime() - t1;
sleepms(100);
t2 = gettime();
for (k = times; k > 0; k--) {
memcpy_fast(dst, src, size);
}
t2 = gettime() - t2;
free(DATA1);
free(DATA2);
printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n",
dstalign? "aligned" : "unalign",
srcalign? "aligned" : "unalign", (int)t2, (int)t1);
}
void bench(int copysize, int times)
{
printf("benchmark(size=%d bytes, times=%d):\n", copysize, times);
benchmark(1, 1, copysize, times);
benchmark(1, 0, copysize, times);
benchmark(0, 1, copysize, times);
benchmark(0, 0, copysize, times);
printf("\n");
}
void random_bench(int maxsize, int times)
{
static char A[11 * 1024 * 1024 + 2];
static char B[11 * 1024 * 1024 + 2];
static int random_offsets[0x10000];
static int random_sizes[0x8000];
unsigned int i, p1, p2;
unsigned int t1, t2;
for (i = 0; i < 0x10000; i++) { // generate random offsets
random_offsets[i] = rand() % (10 * 1024 * 1024 + 1);
}
for (i = 0; i < 0x8000; i++) { // generate random sizes
random_sizes[i] = 1 + rand() % maxsize;
}
sleepms(100);
t1 = gettime();
for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
int offset1 = random_offsets[(p1++) & 0xffff];
int offset2 = random_offsets[(p1++) & 0xffff];
int size = random_sizes[(p2++) & 0x7fff];
memcpy(A + offset1, B + offset2, size);
}
t1 = gettime() - t1;
sleepms(100);
t2 = gettime();
for (p1 = 0, p2 = 0, i = 0; i < times; i++) {
int offset1 = random_offsets[(p1++) & 0xffff];
int offset2 = random_offsets[(p1++) & 0xffff];
int size = random_sizes[(p2++) & 0x7fff];
memcpy_fast(A + offset1, B + offset2, size);
}
t2 = gettime() - t2;
printf("benchmark random access:\n");
printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1);
}
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
int main(void)
{
#if 1
bench(32, 0x1000000);
bench(64, 0x1000000);
bench(512, 0x800000);
bench(1024, 0x400000);
#endif
bench(4096, 0x80000);
bench(8192, 0x40000);
#if 1
bench(1024 * 1024 * 1, 0x800);
bench(1024 * 1024 * 4, 0x200);
#endif
bench(1024 * 1024 * 8, 0x100);
random_bench(2048, 8000000);
return 0;
}
/*
*/

View File

@@ -1,492 +0,0 @@
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
//
//=====================================================================
#ifndef __FAST_MEMCPY_H__
#define __FAST_MEMCPY_H__
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
#define INLINE __inline__ __attribute__((always_inline))
#else
#define INLINE __inline__
#endif
#elif defined(_MSC_VER)
#define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
#define INLINE __inline
#else
#define INLINE
#endif
#endif
//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------
static INLINE void memcpy_avx_16(void *dst, const void *src) {
#if 1
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
#else
*((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0));
*((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8));
#endif
}
static INLINE void memcpy_avx_32(void *dst, const void *src) {
__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
}
static INLINE void memcpy_avx_64(void *dst, const void *src) {
__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
}
static INLINE void memcpy_avx_128(void *dst, const void *src) {
__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
__m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
__m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
_mm256_storeu_si256(((__m256i*)dst) + 2, m2);
_mm256_storeu_si256(((__m256i*)dst) + 3, m3);
}
static INLINE void memcpy_avx_256(void *dst, const void *src) {
__m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
__m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
__m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
__m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
__m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
__m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
__m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
__m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
_mm256_storeu_si256(((__m256i*)dst) + 0, m0);
_mm256_storeu_si256(((__m256i*)dst) + 1, m1);
_mm256_storeu_si256(((__m256i*)dst) + 2, m2);
_mm256_storeu_si256(((__m256i*)dst) + 3, m3);
_mm256_storeu_si256(((__m256i*)dst) + 4, m4);
_mm256_storeu_si256(((__m256i*)dst) + 5, m5);
_mm256_storeu_si256(((__m256i*)dst) + 6, m6);
_mm256_storeu_si256(((__m256i*)dst) + 7, m7);
}
//---------------------------------------------------------------------
// tiny memory copy with jump table optimized
//---------------------------------------------------------------------
static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) {
unsigned char *dd = ((unsigned char*)dst) + size;
const unsigned char *ss = ((const unsigned char*)src) + size;
switch (size) {
case 128: memcpy_avx_128(dd - 128, ss - 128);
case 0: break;
case 129: memcpy_avx_128(dd - 129, ss - 129);
case 1: dd[-1] = ss[-1]; break;
case 130: memcpy_avx_128(dd - 130, ss - 130);
case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 131: memcpy_avx_128(dd - 131, ss - 131);
case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break;
case 132: memcpy_avx_128(dd - 132, ss - 132);
case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 133: memcpy_avx_128(dd - 133, ss - 133);
case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break;
case 134: memcpy_avx_128(dd - 134, ss - 134);
case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 135: memcpy_avx_128(dd - 135, ss - 135);
case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 136: memcpy_avx_128(dd - 136, ss - 136);
case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 137: memcpy_avx_128(dd - 137, ss - 137);
case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break;
case 138: memcpy_avx_128(dd - 138, ss - 138);
case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 139: memcpy_avx_128(dd - 139, ss - 139);
case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 140: memcpy_avx_128(dd - 140, ss - 140);
case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 141: memcpy_avx_128(dd - 141, ss - 141);
case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 142: memcpy_avx_128(dd - 142, ss - 142);
case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 143: memcpy_avx_128(dd - 143, ss - 143);
case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 144: memcpy_avx_128(dd - 144, ss - 144);
case 16: memcpy_avx_16(dd - 16, ss - 16); break;
case 145: memcpy_avx_128(dd - 145, ss - 145);
case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break;
case 146: memcpy_avx_128(dd - 146, ss - 146);
case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 147: memcpy_avx_128(dd - 147, ss - 147);
case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 148: memcpy_avx_128(dd - 148, ss - 148);
case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 149: memcpy_avx_128(dd - 149, ss - 149);
case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 150: memcpy_avx_128(dd - 150, ss - 150);
case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 151: memcpy_avx_128(dd - 151, ss - 151);
case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 152: memcpy_avx_128(dd - 152, ss - 152);
case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 153: memcpy_avx_128(dd - 153, ss - 153);
case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break;
case 154: memcpy_avx_128(dd - 154, ss - 154);
case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break;
case 155: memcpy_avx_128(dd - 155, ss - 155);
case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break;
case 156: memcpy_avx_128(dd - 156, ss - 156);
case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break;
case 157: memcpy_avx_128(dd - 157, ss - 157);
case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break;
case 158: memcpy_avx_128(dd - 158, ss - 158);
case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break;
case 159: memcpy_avx_128(dd - 159, ss - 159);
case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break;
case 160: memcpy_avx_128(dd - 160, ss - 160);
case 32: memcpy_avx_32(dd - 32, ss - 32); break;
case 161: memcpy_avx_128(dd - 161, ss - 161);
case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break;
case 162: memcpy_avx_128(dd - 162, ss - 162);
case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 163: memcpy_avx_128(dd - 163, ss - 163);
case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 164: memcpy_avx_128(dd - 164, ss - 164);
case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 165: memcpy_avx_128(dd - 165, ss - 165);
case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 166: memcpy_avx_128(dd - 166, ss - 166);
case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 167: memcpy_avx_128(dd - 167, ss - 167);
case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 168: memcpy_avx_128(dd - 168, ss - 168);
case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 169: memcpy_avx_128(dd - 169, ss - 169);
case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break;
case 170: memcpy_avx_128(dd - 170, ss - 170);
case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break;
case 171: memcpy_avx_128(dd - 171, ss - 171);
case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break;
case 172: memcpy_avx_128(dd - 172, ss - 172);
case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break;
case 173: memcpy_avx_128(dd - 173, ss - 173);
case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break;
case 174: memcpy_avx_128(dd - 174, ss - 174);
case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break;
case 175: memcpy_avx_128(dd - 175, ss - 175);
case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break;
case 176: memcpy_avx_128(dd - 176, ss - 176);
case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break;
case 177: memcpy_avx_128(dd - 177, ss - 177);
case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break;
case 178: memcpy_avx_128(dd - 178, ss - 178);
case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break;
case 179: memcpy_avx_128(dd - 179, ss - 179);
case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break;
case 180: memcpy_avx_128(dd - 180, ss - 180);
case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break;
case 181: memcpy_avx_128(dd - 181, ss - 181);
case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break;
case 182: memcpy_avx_128(dd - 182, ss - 182);
case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break;
case 183: memcpy_avx_128(dd - 183, ss - 183);
case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break;
case 184: memcpy_avx_128(dd - 184, ss - 184);
case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break;
case 185: memcpy_avx_128(dd - 185, ss - 185);
case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break;
case 186: memcpy_avx_128(dd - 186, ss - 186);
case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break;
case 187: memcpy_avx_128(dd - 187, ss - 187);
case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break;
case 188: memcpy_avx_128(dd - 188, ss - 188);
case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break;
case 189: memcpy_avx_128(dd - 189, ss - 189);
case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break;
case 190: memcpy_avx_128(dd - 190, ss - 190);
case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break;
case 191: memcpy_avx_128(dd - 191, ss - 191);
case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break;
case 192: memcpy_avx_128(dd - 192, ss - 192);
case 64: memcpy_avx_64(dd - 64, ss - 64); break;
case 193: memcpy_avx_128(dd - 193, ss - 193);
case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break;
case 194: memcpy_avx_128(dd - 194, ss - 194);
case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 195: memcpy_avx_128(dd - 195, ss - 195);
case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 196: memcpy_avx_128(dd - 196, ss - 196);
case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 197: memcpy_avx_128(dd - 197, ss - 197);
case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 198: memcpy_avx_128(dd - 198, ss - 198);
case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 199: memcpy_avx_128(dd - 199, ss - 199);
case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 200: memcpy_avx_128(dd - 200, ss - 200);
case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 201: memcpy_avx_128(dd - 201, ss - 201);
case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break;
case 202: memcpy_avx_128(dd - 202, ss - 202);
case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break;
case 203: memcpy_avx_128(dd - 203, ss - 203);
case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break;
case 204: memcpy_avx_128(dd - 204, ss - 204);
case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break;
case 205: memcpy_avx_128(dd - 205, ss - 205);
case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break;
case 206: memcpy_avx_128(dd - 206, ss - 206);
case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break;
case 207: memcpy_avx_128(dd - 207, ss - 207);
case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break;
case 208: memcpy_avx_128(dd - 208, ss - 208);
case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break;
case 209: memcpy_avx_128(dd - 209, ss - 209);
case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break;
case 210: memcpy_avx_128(dd - 210, ss - 210);
case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break;
case 211: memcpy_avx_128(dd - 211, ss - 211);
case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break;
case 212: memcpy_avx_128(dd - 212, ss - 212);
case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break;
case 213: memcpy_avx_128(dd - 213, ss - 213);
case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break;
case 214: memcpy_avx_128(dd - 214, ss - 214);
case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break;
case 215: memcpy_avx_128(dd - 215, ss - 215);
case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break;
case 216: memcpy_avx_128(dd - 216, ss - 216);
case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break;
case 217: memcpy_avx_128(dd - 217, ss - 217);
case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break;
case 218: memcpy_avx_128(dd - 218, ss - 218);
case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break;
case 219: memcpy_avx_128(dd - 219, ss - 219);
case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break;
case 220: memcpy_avx_128(dd - 220, ss - 220);
case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break;
case 221: memcpy_avx_128(dd - 221, ss - 221);
case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break;
case 222: memcpy_avx_128(dd - 222, ss - 222);
case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break;
case 223: memcpy_avx_128(dd - 223, ss - 223);
case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break;
case 224: memcpy_avx_128(dd - 224, ss - 224);
case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break;
case 225: memcpy_avx_128(dd - 225, ss - 225);
case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break;
case 226: memcpy_avx_128(dd - 226, ss - 226);
case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break;
case 227: memcpy_avx_128(dd - 227, ss - 227);
case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break;
case 228: memcpy_avx_128(dd - 228, ss - 228);
case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break;
case 229: memcpy_avx_128(dd - 229, ss - 229);
case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break;
case 230: memcpy_avx_128(dd - 230, ss - 230);
case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break;
case 231: memcpy_avx_128(dd - 231, ss - 231);
case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break;
case 232: memcpy_avx_128(dd - 232, ss - 232);
case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break;
case 233: memcpy_avx_128(dd - 233, ss - 233);
case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break;
case 234: memcpy_avx_128(dd - 234, ss - 234);
case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break;
case 235: memcpy_avx_128(dd - 235, ss - 235);
case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break;
case 236: memcpy_avx_128(dd - 236, ss - 236);
case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break;
case 237: memcpy_avx_128(dd - 237, ss - 237);
case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break;
case 238: memcpy_avx_128(dd - 238, ss - 238);
case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break;
case 239: memcpy_avx_128(dd - 239, ss - 239);
case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break;
case 240: memcpy_avx_128(dd - 240, ss - 240);
case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break;
case 241: memcpy_avx_128(dd - 241, ss - 241);
case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break;
case 242: memcpy_avx_128(dd - 242, ss - 242);
case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break;
case 243: memcpy_avx_128(dd - 243, ss - 243);
case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break;
case 244: memcpy_avx_128(dd - 244, ss - 244);
case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break;
case 245: memcpy_avx_128(dd - 245, ss - 245);
case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break;
case 246: memcpy_avx_128(dd - 246, ss - 246);
case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break;
case 247: memcpy_avx_128(dd - 247, ss - 247);
case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break;
case 248: memcpy_avx_128(dd - 248, ss - 248);
case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break;
case 249: memcpy_avx_128(dd - 249, ss - 249);
case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break;
case 250: memcpy_avx_128(dd - 250, ss - 250);
case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break;
case 251: memcpy_avx_128(dd - 251, ss - 251);
case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break;
case 252: memcpy_avx_128(dd - 252, ss - 252);
case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break;
case 253: memcpy_avx_128(dd - 253, ss - 253);
case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break;
case 254: memcpy_avx_128(dd - 254, ss - 254);
case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break;
case 255: memcpy_avx_128(dd - 255, ss - 255);
case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break;
case 256: memcpy_avx_256(dd - 256, ss - 256); break;
}
return dst;
}
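The jump table above copies from the end of both buffers, so each size class needs only a fixed handful of possibly-overlapping vector stores and no loop. The same trick in isolation, for a single 17–32 byte band (a sketch; the helper name is invented for illustration, and only <emmintrin.h> is assumed):
#include <emmintrin.h>
#include <stddef.h>
// Copy n bytes, 17 <= n <= 32, with two possibly-overlapping 16-byte moves:
// one anchored at the start, one at the end; the overlap covers the middle.
static inline void copy_17_to_32(void * dst, const void * src, size_t n)
{
    __m128i head = _mm_loadu_si128((const __m128i *)src);
    __m128i tail = _mm_loadu_si128((const __m128i *)((const char *)src + n - 16));
    _mm_storeu_si128((__m128i *)dst, head);
    _mm_storeu_si128((__m128i *)((char *)dst + n - 16), tail);
}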
//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
static void* memcpy_fast(void *destination, const void *source, size_t size)
{
unsigned char *dst = (unsigned char*)destination;
const unsigned char *src = (const unsigned char*)source;
static size_t cachesize = 0x200000; // assumed cache size (2 MiB); larger copies switch to streaming stores
size_t padding;
// small memory copy
if (size <= 256) {
memcpy_tiny(dst, src, size);
_mm256_zeroupper();
return destination;
}
// align destination to a 32-byte boundary
padding = (32 - (((size_t)dst) & 31)) & 31;
#if 0
if (padding > 0) {
__m256i head = _mm256_loadu_si256((const __m256i*)src);
_mm256_storeu_si256((__m256i*)dst, head);
dst += padding;
src += padding;
size -= padding;
}
#else
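// The head copy below is unconditional: when padding == 0 it is redundant
// (the main loop rewrites the same bytes), but it keeps the prologue
// branch-free at the cost of at most 32 duplicated bytes.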
__m256i head = _mm256_loadu_si256((const __m256i*)src);
_mm256_storeu_si256((__m256i*)dst, head);
dst += padding;
src += padding;
size -= padding;
#endif
// medium size copy
if (size <= cachesize) {
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 256; size -= 256) {
c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
src += 256;
_mm256_storeu_si256((((__m256i*)dst) + 0), c0);
_mm256_storeu_si256((((__m256i*)dst) + 1), c1);
_mm256_storeu_si256((((__m256i*)dst) + 2), c2);
_mm256_storeu_si256((((__m256i*)dst) + 3), c3);
_mm256_storeu_si256((((__m256i*)dst) + 4), c4);
_mm256_storeu_si256((((__m256i*)dst) + 5), c5);
_mm256_storeu_si256((((__m256i*)dst) + 6), c6);
_mm256_storeu_si256((((__m256i*)dst) + 7), c7);
dst += 256;
}
}
else { // big memory copy
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
if ((((size_t)src) & 31) == 0) { // source aligned
for (; size >= 256; size -= 256) {
c0 = _mm256_load_si256(((const __m256i*)src) + 0);
c1 = _mm256_load_si256(((const __m256i*)src) + 1);
c2 = _mm256_load_si256(((const __m256i*)src) + 2);
c3 = _mm256_load_si256(((const __m256i*)src) + 3);
c4 = _mm256_load_si256(((const __m256i*)src) + 4);
c5 = _mm256_load_si256(((const __m256i*)src) + 5);
c6 = _mm256_load_si256(((const __m256i*)src) + 6);
c7 = _mm256_load_si256(((const __m256i*)src) + 7);
_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
src += 256;
_mm256_stream_si256((((__m256i*)dst) + 0), c0);
_mm256_stream_si256((((__m256i*)dst) + 1), c1);
_mm256_stream_si256((((__m256i*)dst) + 2), c2);
_mm256_stream_si256((((__m256i*)dst) + 3), c3);
_mm256_stream_si256((((__m256i*)dst) + 4), c4);
_mm256_stream_si256((((__m256i*)dst) + 5), c5);
_mm256_stream_si256((((__m256i*)dst) + 6), c6);
_mm256_stream_si256((((__m256i*)dst) + 7), c7);
dst += 256;
}
}
else { // source unaligned
for (; size >= 256; size -= 256) {
c0 = _mm256_loadu_si256(((const __m256i*)src) + 0);
c1 = _mm256_loadu_si256(((const __m256i*)src) + 1);
c2 = _mm256_loadu_si256(((const __m256i*)src) + 2);
c3 = _mm256_loadu_si256(((const __m256i*)src) + 3);
c4 = _mm256_loadu_si256(((const __m256i*)src) + 4);
c5 = _mm256_loadu_si256(((const __m256i*)src) + 5);
c6 = _mm256_loadu_si256(((const __m256i*)src) + 6);
c7 = _mm256_loadu_si256(((const __m256i*)src) + 7);
_mm_prefetch((const char*)(src + 512), _MM_HINT_NTA);
src += 256;
_mm256_stream_si256((((__m256i*)dst) + 0), c0);
_mm256_stream_si256((((__m256i*)dst) + 1), c1);
_mm256_stream_si256((((__m256i*)dst) + 2), c2);
_mm256_stream_si256((((__m256i*)dst) + 3), c3);
_mm256_stream_si256((((__m256i*)dst) + 4), c4);
_mm256_stream_si256((((__m256i*)dst) + 5), c5);
_mm256_stream_si256((((__m256i*)dst) + 6), c6);
_mm256_stream_si256((((__m256i*)dst) + 7), c7);
dst += 256;
}
}
_mm_sfence();
}
memcpy_tiny(dst, src, size);
_mm256_zeroupper();
return destination;
}
#endif
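Taken together, memcpy_fast above is a three-tier dispatch on size: the jump table for copies up to 256 bytes, a prefetching load/store loop while the copy still fits in cache, and non-temporal streaming stores followed by an sfence beyond that. A schematic of just that dispatch, with the three paths replaced by stand-ins (every name here is illustrative, not part of the real code):
#include <cstddef>
#include <cstring>
// Stand-ins for the three paths: the real code uses the jump table,
// a prefetching AVX loop, and _mm256_stream_si256 + _mm_sfence respectively.
static void * small_copy(void * d, const void * s, size_t n) { return std::memcpy(d, s, n); }
static void * cached_copy(void * d, const void * s, size_t n) { return std::memcpy(d, s, n); }
static void * streaming_copy(void * d, const void * s, size_t n) { return std::memcpy(d, s, n); }
static void * memcpy_dispatch(void * dst, const void * src, size_t size)
{
    const size_t cache_threshold = 0x200000; // the same 2 MiB threshold as above
    if (size <= 256)
        return small_copy(dst, src, size);      // overlapping tail stores, no loop
    if (size <= cache_threshold)
        return cached_copy(dst, src, size);     // ordinary stores, data stays in cache
    return streaming_copy(dst, src, size);      // bypass the cache, then fence
}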

View File

@ -1,22 +0,0 @@
The MIT License (MIT)
Copyright (c) 2015 Linwei
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,20 +0,0 @@
Internal implementation of the `memcpy` function.
It has the following advantages over the `libc`-supplied implementation:
- it is linked statically, so the function is called directly, not through a `PLT` (procedure linkage table of a shared library);
- it is linked statically, so the function can have position-dependent code;
- your binaries will not depend on `glibc`'s `memcpy`, which forces a dependency on a specific symbol version like `memcpy@@GLIBC_2.14` and consequently on a specific version of the `glibc` library;
- you can include `memcpy.h` directly, so the function has a chance to be inlined, which is beneficial for sizes that are small but unknown at compile time;
- this version of `memcpy` claims to be faster (in our benchmarks, the difference is within a few percent).
Currently it uses the implementation from **Linwei** (skywind3000@163.com).
See https://www.zhihu.com/question/35172305 for the discussion.
Drawbacks:
- it only uses SSE 2 and doesn't use wider (AVX, AVX-512) vector registers when available;
- no CPU dispatching; it doesn't take the actual cache size into account.
Also worth looking at:
- the simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S
- the implementation from Agner Fog: http://www.agner.org/optimize/
- the glibc source code.
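For illustration, the inlining point above amounts to a usage pattern like this (a sketch: `inline_memcpy` is assumed to be the inlinable entry point exposed by the header, and the include path is abbreviated):
#include "memcpy.h" // the header described above
void copy_field(char * dst, const char * src, size_t n)
{
    // n is typically small but unknown at compile time; because the call can
    // be inlined, the size dispatch happens right here instead of behind a
    // PLT call into a shared library.
    inline_memcpy(dst, src, n);
}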

View File

@ -1,6 +0,0 @@
#include "FastMemcpy.h"
void * memcpy(void * __restrict destination, const void * __restrict source, size_t size)
{
return memcpy_fast(destination, source, size);
}

View File

@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS)
add_subdirectory (db-generator)
add_subdirectory (wal-dump)
add_subdirectory (check-mysql-binlog)
add_subdirectory (memcpy-bench)
endif ()
if (ENABLE_CODE_QUALITY)

View File

@ -0,0 +1,5 @@
enable_language(ASM)
add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S)
#target_compile_options(memcpy-bench PRIVATE -mavx)
target_link_libraries(memcpy-bench PRIVATE dbms)
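Assuming the default Ninja generator, the benchmark then builds like any other utils target, e.g. `ninja memcpy-bench` from the build directory.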

View File

@ -0,0 +1,770 @@
#pragma once
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// ~50% speed-up on average vs. the standard memcpy (tested with VC2012 / GCC 5.1)
//
//=====================================================================
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>
//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
#define INLINE __inline__ __attribute__((always_inline))
#else
#define INLINE __inline__
#endif
#elif defined(_MSC_VER)
#define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
#define INLINE __inline
#else
#define INLINE
#endif
#endif
typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
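// The aligned(1) aliases above tell the compiler that these pointees may be
// unaligned, so the plain assignments in memcpy_tiny below are well-defined
// unaligned loads/stores rather than undefined behaviour.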
//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------
static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
}
static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
}
static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
__m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
__m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
}
static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
__m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
__m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
__m128i m4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
__m128i m5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
__m128i m6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
__m128i m7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7);
}
//---------------------------------------------------------------------
// tiny memory copy, optimized with a jump table
//---------------------------------------------------------------------
/// The attribute is used to avoid an error from the undefined behaviour sanitizer:
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
/// Found by 01307_orc_output_format.sh; the cause is ORCBlockInputFormat and the external ORC library.
__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
{
unsigned char *dd = ((unsigned char*)dst) + size;
const unsigned char *ss = ((const unsigned char*)src) + size;
switch (size)
{
case 64:
memcpy_sse2_64(dd - 64, ss - 64);
[[fallthrough]];
case 0:
break;
case 65:
memcpy_sse2_64(dd - 65, ss - 65);
[[fallthrough]];
case 1:
dd[-1] = ss[-1];
break;
case 66:
memcpy_sse2_64(dd - 66, ss - 66);
[[fallthrough]];
case 2:
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 67:
memcpy_sse2_64(dd - 67, ss - 67);
[[fallthrough]];
case 3:
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 68:
memcpy_sse2_64(dd - 68, ss - 68);
[[fallthrough]];
case 4:
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 69:
memcpy_sse2_64(dd - 69, ss - 69);
[[fallthrough]];
case 5:
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 70:
memcpy_sse2_64(dd - 70, ss - 70);
[[fallthrough]];
case 6:
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 71:
memcpy_sse2_64(dd - 71, ss - 71);
[[fallthrough]];
case 7:
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 72:
memcpy_sse2_64(dd - 72, ss - 72);
[[fallthrough]];
case 8:
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 73:
memcpy_sse2_64(dd - 73, ss - 73);
[[fallthrough]];
case 9:
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 74:
memcpy_sse2_64(dd - 74, ss - 74);
[[fallthrough]];
case 10:
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 75:
memcpy_sse2_64(dd - 75, ss - 75);
[[fallthrough]];
case 11:
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 76:
memcpy_sse2_64(dd - 76, ss - 76);
[[fallthrough]];
case 12:
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 77:
memcpy_sse2_64(dd - 77, ss - 77);
[[fallthrough]];
case 13:
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 78:
memcpy_sse2_64(dd - 78, ss - 78);
[[fallthrough]];
case 14:
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 79:
memcpy_sse2_64(dd - 79, ss - 79);
[[fallthrough]];
case 15:
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 80:
memcpy_sse2_64(dd - 80, ss - 80);
[[fallthrough]];
case 16:
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 81:
memcpy_sse2_64(dd - 81, ss - 81);
[[fallthrough]];
case 17:
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 82:
memcpy_sse2_64(dd - 82, ss - 82);
[[fallthrough]];
case 18:
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 83:
memcpy_sse2_64(dd - 83, ss - 83);
[[fallthrough]];
case 19:
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 84:
memcpy_sse2_64(dd - 84, ss - 84);
[[fallthrough]];
case 20:
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 85:
memcpy_sse2_64(dd - 85, ss - 85);
[[fallthrough]];
case 21:
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 86:
memcpy_sse2_64(dd - 86, ss - 86);
[[fallthrough]];
case 22:
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 87:
memcpy_sse2_64(dd - 87, ss - 87);
[[fallthrough]];
case 23:
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 88:
memcpy_sse2_64(dd - 88, ss - 88);
[[fallthrough]];
case 24:
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 89:
memcpy_sse2_64(dd - 89, ss - 89);
[[fallthrough]];
case 25:
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 90:
memcpy_sse2_64(dd - 90, ss - 90);
[[fallthrough]];
case 26:
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 91:
memcpy_sse2_64(dd - 91, ss - 91);
[[fallthrough]];
case 27:
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 92:
memcpy_sse2_64(dd - 92, ss - 92);
[[fallthrough]];
case 28:
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 93:
memcpy_sse2_64(dd - 93, ss - 93);
[[fallthrough]];
case 29:
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 94:
memcpy_sse2_64(dd - 94, ss - 94);
[[fallthrough]];
case 30:
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 95:
memcpy_sse2_64(dd - 95, ss - 95);
[[fallthrough]];
case 31:
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 96:
memcpy_sse2_64(dd - 96, ss - 96);
[[fallthrough]];
case 32:
memcpy_sse2_32(dd - 32, ss - 32);
break;
case 97:
memcpy_sse2_64(dd - 97, ss - 97);
[[fallthrough]];
case 33:
memcpy_sse2_32(dd - 33, ss - 33);
dd[-1] = ss[-1];
break;
case 98:
memcpy_sse2_64(dd - 98, ss - 98);
[[fallthrough]];
case 34:
memcpy_sse2_32(dd - 34, ss - 34);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 99:
memcpy_sse2_64(dd - 99, ss - 99);
[[fallthrough]];
case 35:
memcpy_sse2_32(dd - 35, ss - 35);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 100:
memcpy_sse2_64(dd - 100, ss - 100);
[[fallthrough]];
case 36:
memcpy_sse2_32(dd - 36, ss - 36);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 101:
memcpy_sse2_64(dd - 101, ss - 101);
[[fallthrough]];
case 37:
memcpy_sse2_32(dd - 37, ss - 37);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 102:
memcpy_sse2_64(dd - 102, ss - 102);
[[fallthrough]];
case 38:
memcpy_sse2_32(dd - 38, ss - 38);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 103:
memcpy_sse2_64(dd - 103, ss - 103);
[[fallthrough]];
case 39:
memcpy_sse2_32(dd - 39, ss - 39);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 104:
memcpy_sse2_64(dd - 104, ss - 104);
[[fallthrough]];
case 40:
memcpy_sse2_32(dd - 40, ss - 40);
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 105:
memcpy_sse2_64(dd - 105, ss - 105);
[[fallthrough]];
case 41:
memcpy_sse2_32(dd - 41, ss - 41);
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 106:
memcpy_sse2_64(dd - 106, ss - 106);
[[fallthrough]];
case 42:
memcpy_sse2_32(dd - 42, ss - 42);
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 107:
memcpy_sse2_64(dd - 107, ss - 107);
[[fallthrough]];
case 43:
memcpy_sse2_32(dd - 43, ss - 43);
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 108:
memcpy_sse2_64(dd - 108, ss - 108);
[[fallthrough]];
case 44:
memcpy_sse2_32(dd - 44, ss - 44);
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 109:
memcpy_sse2_64(dd - 109, ss - 109);
[[fallthrough]];
case 45:
memcpy_sse2_32(dd - 45, ss - 45);
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 110:
memcpy_sse2_64(dd - 110, ss - 110);
[[fallthrough]];
case 46:
memcpy_sse2_32(dd - 46, ss - 46);
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 111:
memcpy_sse2_64(dd - 111, ss - 111);
[[fallthrough]];
case 47:
memcpy_sse2_32(dd - 47, ss - 47);
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 112:
memcpy_sse2_64(dd - 112, ss - 112);
[[fallthrough]];
case 48:
memcpy_sse2_32(dd - 48, ss - 48);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 113:
memcpy_sse2_64(dd - 113, ss - 113);
[[fallthrough]];
case 49:
memcpy_sse2_32(dd - 49, ss - 49);
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 114:
memcpy_sse2_64(dd - 114, ss - 114);
[[fallthrough]];
case 50:
memcpy_sse2_32(dd - 50, ss - 50);
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 115:
memcpy_sse2_64(dd - 115, ss - 115);
[[fallthrough]];
case 51:
memcpy_sse2_32(dd - 51, ss - 51);
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 116:
memcpy_sse2_64(dd - 116, ss - 116);
[[fallthrough]];
case 52:
memcpy_sse2_32(dd - 52, ss - 52);
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 117:
memcpy_sse2_64(dd - 117, ss - 117);
[[fallthrough]];
case 53:
memcpy_sse2_32(dd - 53, ss - 53);
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 118:
memcpy_sse2_64(dd - 118, ss - 118);
[[fallthrough]];
case 54:
memcpy_sse2_32(dd - 54, ss - 54);
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 119:
memcpy_sse2_64(dd - 119, ss - 119);
[[fallthrough]];
case 55:
memcpy_sse2_32(dd - 55, ss - 55);
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 120:
memcpy_sse2_64(dd - 120, ss - 120);
[[fallthrough]];
case 56:
memcpy_sse2_32(dd - 56, ss - 56);
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 121:
memcpy_sse2_64(dd - 121, ss - 121);
[[fallthrough]];
case 57:
memcpy_sse2_32(dd - 57, ss - 57);
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 122:
memcpy_sse2_64(dd - 122, ss - 122);
[[fallthrough]];
case 58:
memcpy_sse2_32(dd - 58, ss - 58);
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 123:
memcpy_sse2_64(dd - 123, ss - 123);
[[fallthrough]];
case 59:
memcpy_sse2_32(dd - 59, ss - 59);
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 124:
memcpy_sse2_64(dd - 124, ss - 124);
[[fallthrough]];
case 60:
memcpy_sse2_32(dd - 60, ss - 60);
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 125:
memcpy_sse2_64(dd - 125, ss - 125);
[[fallthrough]];
case 61:
memcpy_sse2_32(dd - 61, ss - 61);
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 126:
memcpy_sse2_64(dd - 126, ss - 126);
[[fallthrough]];
case 62:
memcpy_sse2_32(dd - 62, ss - 62);
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 127:
memcpy_sse2_64(dd - 127, ss - 127);
[[fallthrough]];
case 63:
memcpy_sse2_32(dd - 63, ss - 63);
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 128:
memcpy_sse2_128(dd - 128, ss - 128);
break;
}
return dst;
}
//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = (unsigned char*)destination;
const unsigned char *src = (const unsigned char*)source;
static size_t cachesize = 0x200000; // assumed cache size (2 MiB); larger copies switch to streaming stores
size_t padding;
// small memory copy
if (size <= 128)
{
return memcpy_tiny(dst, src, size);
}
// align destination to a 16-byte boundary
padding = (16 - (((size_t)dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
if (size <= cachesize)
{
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 128; size -= 128)
{
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
else
{ // big memory copy
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
if ((((size_t)src) & 15) == 0)
{ // source aligned
for (; size >= 128; size -= 128)
{
c0 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
else
{ // source unaligned
for (; size >= 128; size -= 128)
{
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
_mm_sfence();
}
memcpy_tiny(dst, src, size);
return destination;
}
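A minimal usage sketch for the routine above (assuming this header's file name is FastMemcpy.h, as the wrapper earlier in this diff suggests; source and destination must not overlap because of the __restrict qualifiers):
#include "FastMemcpy.h"
#include <vector>
int main()
{
    std::vector<char> src(1 << 20, 'x');
    std::vector<char> dst(1 << 20);
    // Non-overlapping buffers of equal size; returns the destination pointer.
    memcpy_fast_sse(dst.data(), src.data(), src.size());
    return dst.back() == 'x' ? 0 : 1;
}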

View File

@ -0,0 +1,496 @@
#pragma once
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// ~50% speed-up on average vs. the standard memcpy (tested with VC2012 / GCC 5.1)
//
//=====================================================================
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
#define INLINE __inline__ __attribute__((always_inline))
#else
#define INLINE __inline__
#endif
#elif defined(_MSC_VER)
#define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
#define INLINE __inline
#else
#define INLINE
#endif
#endif
//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------
static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src)
{
#if 1
__m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0);
_mm_storeu_si128(((__m128i*)dst) + 0, m0);
#else
*((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0));
*((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8));
#endif
}
static INLINE void memcpy_avx_32(void *dst, const void *src)
{
__m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
}
static INLINE void memcpy_avx_64(void *dst, const void *src)
{
__m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
__m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
}
static INLINE void memcpy_avx_128(void *dst, const void *src)
{
__m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
__m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
__m256i m2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
__m256i m3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3);
}
static INLINE void memcpy_avx_256(void *dst, const void *src)
{
__m256i m0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
__m256i m1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
__m256i m2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
__m256i m3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
__m256i m4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
__m256i m5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
__m256i m6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
__m256i m7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 4, m4);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 5, m5);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 6, m6);
_mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 7, m7);
}
//---------------------------------------------------------------------
// tiny memory copy, optimized with a jump table
//---------------------------------------------------------------------
static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size)
{
unsigned char *dd = reinterpret_cast<unsigned char *>(dst) + size;
const unsigned char *ss = reinterpret_cast<const unsigned char*>(src) + size;
switch (size)
{
case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]];
case 0: break;
case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]];
case 1: dd[-1] = ss[-1]; break;
case 130: memcpy_avx_128(dd - 130, ss - 130); [[fallthrough]];
case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 131: memcpy_avx_128(dd - 131, ss - 131); [[fallthrough]];
case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break;
case 132: memcpy_avx_128(dd - 132, ss - 132); [[fallthrough]];
case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 133: memcpy_avx_128(dd - 133, ss - 133); [[fallthrough]];
case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break;
case 134: memcpy_avx_128(dd - 134, ss - 134); [[fallthrough]];
case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 135: memcpy_avx_128(dd - 135, ss - 135); [[fallthrough]];
case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 136: memcpy_avx_128(dd - 136, ss - 136); [[fallthrough]];
case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 137: memcpy_avx_128(dd - 137, ss - 137); [[fallthrough]];
case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break;
case 138: memcpy_avx_128(dd - 138, ss - 138); [[fallthrough]];
case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 139: memcpy_avx_128(dd - 139, ss - 139); [[fallthrough]];
case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 140: memcpy_avx_128(dd - 140, ss - 140); [[fallthrough]];
case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 141: memcpy_avx_128(dd - 141, ss - 141); [[fallthrough]];
case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 142: memcpy_avx_128(dd - 142, ss - 142); [[fallthrough]];
case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 143: memcpy_avx_128(dd - 143, ss - 143); [[fallthrough]];
case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 144: memcpy_avx_128(dd - 144, ss - 144); [[fallthrough]];
case 16: memcpy_avx_16(dd - 16, ss - 16); break;
case 145: memcpy_avx_128(dd - 145, ss - 145); [[fallthrough]];
case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break;
case 146: memcpy_avx_128(dd - 146, ss - 146); [[fallthrough]];
case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 147: memcpy_avx_128(dd - 147, ss - 147); [[fallthrough]];
case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 148: memcpy_avx_128(dd - 148, ss - 148); [[fallthrough]];
case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 149: memcpy_avx_128(dd - 149, ss - 149); [[fallthrough]];
case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 150: memcpy_avx_128(dd - 150, ss - 150); [[fallthrough]];
case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 151: memcpy_avx_128(dd - 151, ss - 151); [[fallthrough]];
case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 152: memcpy_avx_128(dd - 152, ss - 152); [[fallthrough]];
case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 153: memcpy_avx_128(dd - 153, ss - 153); [[fallthrough]];
case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break;
case 154: memcpy_avx_128(dd - 154, ss - 154); [[fallthrough]];
case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break;
case 155: memcpy_avx_128(dd - 155, ss - 155); [[fallthrough]];
case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break;
case 156: memcpy_avx_128(dd - 156, ss - 156); [[fallthrough]];
case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break;
case 157: memcpy_avx_128(dd - 157, ss - 157); [[fallthrough]];
case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break;
case 158: memcpy_avx_128(dd - 158, ss - 158); [[fallthrough]];
case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break;
case 159: memcpy_avx_128(dd - 159, ss - 159); [[fallthrough]];
case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break;
case 160: memcpy_avx_128(dd - 160, ss - 160); [[fallthrough]];
case 32: memcpy_avx_32(dd - 32, ss - 32); break;
case 161: memcpy_avx_128(dd - 161, ss - 161); [[fallthrough]];
case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break;
case 162: memcpy_avx_128(dd - 162, ss - 162); [[fallthrough]];
case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 163: memcpy_avx_128(dd - 163, ss - 163); [[fallthrough]];
case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 164: memcpy_avx_128(dd - 164, ss - 164); [[fallthrough]];
case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 165: memcpy_avx_128(dd - 165, ss - 165); [[fallthrough]];
case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 166: memcpy_avx_128(dd - 166, ss - 166); [[fallthrough]];
case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 167: memcpy_avx_128(dd - 167, ss - 167); [[fallthrough]];
case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 168: memcpy_avx_128(dd - 168, ss - 168); [[fallthrough]];
case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 169: memcpy_avx_128(dd - 169, ss - 169); [[fallthrough]];
case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break;
case 170: memcpy_avx_128(dd - 170, ss - 170); [[fallthrough]];
case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break;
case 171: memcpy_avx_128(dd - 171, ss - 171); [[fallthrough]];
case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break;
case 172: memcpy_avx_128(dd - 172, ss - 172); [[fallthrough]];
case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break;
case 173: memcpy_avx_128(dd - 173, ss - 173); [[fallthrough]];
case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break;
case 174: memcpy_avx_128(dd - 174, ss - 174); [[fallthrough]];
case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break;
case 175: memcpy_avx_128(dd - 175, ss - 175); [[fallthrough]];
case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break;
case 176: memcpy_avx_128(dd - 176, ss - 176); [[fallthrough]];
case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break;
case 177: memcpy_avx_128(dd - 177, ss - 177); [[fallthrough]];
case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break;
case 178: memcpy_avx_128(dd - 178, ss - 178); [[fallthrough]];
case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break;
case 179: memcpy_avx_128(dd - 179, ss - 179); [[fallthrough]];
case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break;
case 180: memcpy_avx_128(dd - 180, ss - 180); [[fallthrough]];
case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break;
case 181: memcpy_avx_128(dd - 181, ss - 181); [[fallthrough]];
case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break;
case 182: memcpy_avx_128(dd - 182, ss - 182); [[fallthrough]];
case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break;
case 183: memcpy_avx_128(dd - 183, ss - 183); [[fallthrough]];
case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break;
case 184: memcpy_avx_128(dd - 184, ss - 184); [[fallthrough]];
case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break;
case 185: memcpy_avx_128(dd - 185, ss - 185); [[fallthrough]];
case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break;
case 186: memcpy_avx_128(dd - 186, ss - 186); [[fallthrough]];
case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break;
case 187: memcpy_avx_128(dd - 187, ss - 187); [[fallthrough]];
case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break;
case 188: memcpy_avx_128(dd - 188, ss - 188); [[fallthrough]];
case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break;
case 189: memcpy_avx_128(dd - 189, ss - 189); [[fallthrough]];
case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break;
case 190: memcpy_avx_128(dd - 190, ss - 190); [[fallthrough]];
case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break;
case 191: memcpy_avx_128(dd - 191, ss - 191); [[fallthrough]];
case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break;
case 192: memcpy_avx_128(dd - 192, ss - 192); [[fallthrough]];
case 64: memcpy_avx_64(dd - 64, ss - 64); break;
case 193: memcpy_avx_128(dd - 193, ss - 193); [[fallthrough]];
case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break;
case 194: memcpy_avx_128(dd - 194, ss - 194); [[fallthrough]];
case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break;
case 195: memcpy_avx_128(dd - 195, ss - 195); [[fallthrough]];
case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 196: memcpy_avx_128(dd - 196, ss - 196); [[fallthrough]];
case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break;
case 197: memcpy_avx_128(dd - 197, ss - 197); [[fallthrough]];
case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 198: memcpy_avx_128(dd - 198, ss - 198); [[fallthrough]];
case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 199: memcpy_avx_128(dd - 199, ss - 199); [[fallthrough]];
case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 200: memcpy_avx_128(dd - 200, ss - 200); [[fallthrough]];
case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break;
case 201: memcpy_avx_128(dd - 201, ss - 201); [[fallthrough]];
case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break;
case 202: memcpy_avx_128(dd - 202, ss - 202); [[fallthrough]];
case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break;
case 203: memcpy_avx_128(dd - 203, ss - 203); [[fallthrough]];
case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break;
case 204: memcpy_avx_128(dd - 204, ss - 204); [[fallthrough]];
case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break;
case 205: memcpy_avx_128(dd - 205, ss - 205); [[fallthrough]];
case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break;
case 206: memcpy_avx_128(dd - 206, ss - 206); [[fallthrough]];
case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break;
case 207: memcpy_avx_128(dd - 207, ss - 207); [[fallthrough]];
case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break;
case 208: memcpy_avx_128(dd - 208, ss - 208); [[fallthrough]];
case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break;
case 209: memcpy_avx_128(dd - 209, ss - 209); [[fallthrough]];
case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break;
case 210: memcpy_avx_128(dd - 210, ss - 210); [[fallthrough]];
case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break;
case 211: memcpy_avx_128(dd - 211, ss - 211); [[fallthrough]];
case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break;
case 212: memcpy_avx_128(dd - 212, ss - 212); [[fallthrough]];
case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break;
case 213: memcpy_avx_128(dd - 213, ss - 213); [[fallthrough]];
case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break;
case 214: memcpy_avx_128(dd - 214, ss - 214); [[fallthrough]];
case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break;
case 215: memcpy_avx_128(dd - 215, ss - 215); [[fallthrough]];
case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break;
case 216: memcpy_avx_128(dd - 216, ss - 216); [[fallthrough]];
case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break;
case 217: memcpy_avx_128(dd - 217, ss - 217); [[fallthrough]];
case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break;
case 218: memcpy_avx_128(dd - 218, ss - 218); [[fallthrough]];
case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break;
case 219: memcpy_avx_128(dd - 219, ss - 219); [[fallthrough]];
case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break;
case 220: memcpy_avx_128(dd - 220, ss - 220); [[fallthrough]];
case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break;
case 221: memcpy_avx_128(dd - 221, ss - 221); [[fallthrough]];
case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break;
case 222: memcpy_avx_128(dd - 222, ss - 222); [[fallthrough]];
case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break;
case 223: memcpy_avx_128(dd - 223, ss - 223); [[fallthrough]];
case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break;
case 224: memcpy_avx_128(dd - 224, ss - 224); [[fallthrough]];
case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break;
case 225: memcpy_avx_128(dd - 225, ss - 225); [[fallthrough]];
case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break;
case 226: memcpy_avx_128(dd - 226, ss - 226); [[fallthrough]];
case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break;
case 227: memcpy_avx_128(dd - 227, ss - 227); [[fallthrough]];
case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break;
case 228: memcpy_avx_128(dd - 228, ss - 228); [[fallthrough]];
case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break;
case 229: memcpy_avx_128(dd - 229, ss - 229); [[fallthrough]];
case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break;
case 230: memcpy_avx_128(dd - 230, ss - 230); [[fallthrough]];
case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break;
case 231: memcpy_avx_128(dd - 231, ss - 231); [[fallthrough]];
case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break;
case 232: memcpy_avx_128(dd - 232, ss - 232); [[fallthrough]];
case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break;
case 233: memcpy_avx_128(dd - 233, ss - 233); [[fallthrough]];
case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break;
case 234: memcpy_avx_128(dd - 234, ss - 234); [[fallthrough]];
case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break;
case 235: memcpy_avx_128(dd - 235, ss - 235); [[fallthrough]];
case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break;
case 236: memcpy_avx_128(dd - 236, ss - 236); [[fallthrough]];
case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break;
case 237: memcpy_avx_128(dd - 237, ss - 237); [[fallthrough]];
case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break;
case 238: memcpy_avx_128(dd - 238, ss - 238); [[fallthrough]];
case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break;
case 239: memcpy_avx_128(dd - 239, ss - 239); [[fallthrough]];
case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break;
case 240: memcpy_avx_128(dd - 240, ss - 240); [[fallthrough]];
case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break;
case 241: memcpy_avx_128(dd - 241, ss - 241); [[fallthrough]];
case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break;
case 242: memcpy_avx_128(dd - 242, ss - 242); [[fallthrough]];
case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break;
case 243: memcpy_avx_128(dd - 243, ss - 243); [[fallthrough]];
case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break;
case 244: memcpy_avx_128(dd - 244, ss - 244); [[fallthrough]];
case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break;
case 245: memcpy_avx_128(dd - 245, ss - 245); [[fallthrough]];
case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break;
case 246: memcpy_avx_128(dd - 246, ss - 246); [[fallthrough]];
case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break;
case 247: memcpy_avx_128(dd - 247, ss - 247); [[fallthrough]];
case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break;
case 248: memcpy_avx_128(dd - 248, ss - 248); [[fallthrough]];
case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break;
case 249: memcpy_avx_128(dd - 249, ss - 249); [[fallthrough]];
case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break;
case 250: memcpy_avx_128(dd - 250, ss - 250); [[fallthrough]];
case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break;
case 251: memcpy_avx_128(dd - 251, ss - 251); [[fallthrough]];
case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break;
case 252: memcpy_avx_128(dd - 252, ss - 252); [[fallthrough]];
case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break;
case 253: memcpy_avx_128(dd - 253, ss - 253); [[fallthrough]];
case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break;
case 254: memcpy_avx_128(dd - 254, ss - 254); [[fallthrough]];
case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break;
case 255: memcpy_avx_128(dd - 255, ss - 255); [[fallthrough]];
case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break;
case 256: memcpy_avx_256(dd - 256, ss - 256); break;
}
return dst;
}
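The block of cases above is the tail of memcpy_tiny_avx: dd and ss point one past the end of the copy, each case for a size N in 129..255 peels the leading 128 bytes and falls through to the case for N - 128, and the small cases finish with overlapping 32- and 64-byte stores (which is why, e.g., case 93 is served by a 64-byte plus a 32-byte copy). A minimal sketch of the same end-relative fall-through pattern, reduced to 16-byte SSE chunks and to sizes that are multiples of 16 (the names are illustrative, not from the patch):
#include <cstddef>
#include <cstdint>
#include <emmintrin.h>
static inline void copy16(uint8_t * dst, const uint8_t * src)
{
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
}
// Sketch only: handles size in {0, 16, 32, 48, 64}.
void * tiny_copy_sketch(void * dst, const void * src, size_t size)
{
    uint8_t * dd = static_cast<uint8_t *>(dst) + size;              // one past the end, like dd above
    const uint8_t * ss = static_cast<const uint8_t *>(src) + size;
    switch (size)
    {
        case 64: copy16(dd - 64, ss - 64); [[fallthrough]];  // peel 16 leading bytes, 48 remain
        case 48: copy16(dd - 48, ss - 48); [[fallthrough]];  // peel 16 leading bytes, 32 remain
        case 32: copy16(dd - 32, ss - 32); [[fallthrough]];  // peel 16 leading bytes, 16 remain
        case 16: copy16(dd - 16, ss - 16); break;
        case 0: break;
    }
    return dst;
}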
//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
void* memcpy_fast_avx(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = reinterpret_cast<unsigned char*>(destination);
const unsigned char *src = reinterpret_cast<const unsigned char*>(source);
static size_t cachesize = 0x200000; // assumed L3 cache size (2 MiB); larger copies use non-temporal stores
size_t padding;
// small memory copy
if (size <= 256)
{
memcpy_tiny_avx(dst, src, size);
_mm256_zeroupper();
return destination;
}
// align destination to a 32-byte boundary
padding = (32 - (((size_t)dst) & 31)) & 31;
#if 0
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
dst += padding;
src += padding;
size -= padding;
}
#else
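// Branchless variant: size > 256 is guaranteed at this point, so an unconditional
// 32-byte head copy is always in bounds, even when padding == 0 (the pointers then simply do not advance).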
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
dst += padding;
src += padding;
size -= padding;
#endif
// medium size copy
if (size <= cachesize)
{
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 256; size -= 256)
{
c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
src += 256;
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
_mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
dst += 256;
}
}
else
{ // big memory copy
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
if ((((size_t)src) & 31) == 0)
{ // source aligned
for (; size >= 256; size -= 256)
{
c0 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 0);
c1 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 1);
c2 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 2);
c3 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 3);
c4 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 4);
c5 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 5);
c6 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 6);
c7 = _mm256_load_si256((reinterpret_cast<const __m256i*>(src)) + 7);
src += 256;
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
dst += 256;
}
}
else
{ // source unaligned
for (; size >= 256; size -= 256)
{
c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
src += 256;
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
_mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
dst += 256;
}
}
_mm_sfence();
}
memcpy_tiny_avx(dst, src, size);
_mm256_zeroupper();
return destination;
}
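In summary, memcpy_fast_avx is tiered by size: up to 256 bytes goes through memcpy_tiny_avx, anything that fits the assumed 2 MiB L3 uses the ordinary 256-byte unrolled store loop, and larger copies stream past the cache with non-temporal stores finished by an sfence. A condensed sketch of that policy, assuming AVX is available; std::memcpy stands in for the tiny and tail paths, and the function name is illustrative:
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
// Precondition for the streaming branch: dst is 32-byte aligned
// (memcpy_fast_avx establishes this with the unaligned head copy).
void copy_tiered_sketch(uint8_t * dst, const uint8_t * src, size_t size)
{
    constexpr size_t cache_threshold = 0x200000; // assumed L3 cache size, as above
    if (size <= 256)
    {
        std::memcpy(dst, src, size); // stand-in for the tiny path
        return;
    }
    const bool stream = size > cache_threshold;
    for (; size >= 32; size -= 32, src += 32, dst += 32)
    {
        __m256i c = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
        if (stream)
            _mm256_stream_si256(reinterpret_cast<__m256i *>(dst), c); // bypass the cache
        else
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), c);
    }
    if (stream)
        _mm_sfence(); // make the non-temporal stores visible before returning
    std::memcpy(dst, src, size); // stand-in for the overlapping tail copy
    _mm256_zeroupper();
}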

View File

@ -0,0 +1,620 @@
#include <memory>
#include <cstddef>
#include <string>
#include <random>
#include <iostream>
#include <iomanip>
#include <thread>
#include <dlfcn.h>
#include <pcg_random.hpp>
#include <common/defines.h>
#include <Common/Stopwatch.h>
#pragma GCC diagnostic ignored "-Wold-style-cast"
#pragma GCC diagnostic ignored "-Wcast-align"
#pragma GCC diagnostic ignored "-Wcast-qual"
#include "FastMemcpy.h"
//#include "FastMemcpy_Avx.h"
#include <emmintrin.h>
#include <immintrin.h>
template <typename F, typename MemcpyImpl>
void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl)
{
while (size)
{
size_t bytes_to_copy = std::min<size_t>(size, chunk_size_distribution());
impl(dst, src, bytes_to_copy);
dst += bytes_to_copy;
src += bytes_to_copy;
size -= bytes_to_copy;
}
}
using RNG = pcg32_fast;
template <size_t N>
size_t generatorUniform(RNG & rng) { return rng() % N; }
template <typename F, typename MemcpyImpl>
void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl)
{
Stopwatch watch;
std::vector<std::thread> threads;
threads.reserve(num_threads);
for (size_t thread_num = 0; thread_num < num_threads; ++thread_num)
{
size_t begin = size * thread_num / num_threads;
size_t end = size * (thread_num + 1) / num_threads;
threads.emplace_back([begin, end, iterations, &src, &dst, &generator, &impl]
{
for (size_t iteration = 0; iteration < iterations; ++iteration)
{
loop(
iteration % 2 ? &src[begin] : &dst[begin],
iteration % 2 ? &dst[begin] : &src[begin],
end - begin,
[rng = RNG(), &generator]() mutable { return generator(rng); },
std::forward<MemcpyImpl>(impl));
}
});
}
for (auto & thread : threads)
thread.join();
double elapsed_ns = watch.elapsed();
/// Validation
size_t sum = 0;
for (size_t i = 0; i < size; ++i)
sum += dst[i];
std::cerr << std::fixed << std::setprecision(3)
<< "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n";
}
using memcpy_type = void * (*)(void * __restrict, const void * __restrict, size_t);
// "rep movsb" is fast on CPUs with the ERMS (Enhanced REP MOVSB) feature.
// The instruction advances %rdi/%rsi, so save the original dst for the return value.
static void * memcpy_erms(void * dst, const void * src, size_t size)
{
void * ret = dst;
asm volatile (
"rep movsb"
: "=D"(dst), "=S"(src), "=c"(size)
: "0"(dst), "1"(src), "2"(size)
: "memory");
return ret;
}
extern "C" void * memcpy_jart(void * dst, const void * src, size_t size);
extern "C" void MemCpy(void * dst, const void * src, size_t size);
static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
size_t padding;
// small memory copy
if (size <= 16)
return memcpy_tiny(dst, src, size);
// align destination to 16 bytes boundary
padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
__m128i c0;
for (; size >= 16; size -= 16)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
src += 16;
_mm_store_si128((reinterpret_cast<__m128i*>(dst)), c0);
dst += 16;
}
memcpy_tiny(dst, src, size);
return destination;
}
static void * memcpySSE2Unrolled2(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
size_t padding;
// small memory copy
if (size <= 32)
return memcpy_tiny(dst, src, size);
// align destination to 16 bytes boundary
padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
__m128i c0, c1;
for (; size >= 32; size -= 32)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
src += 32;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
dst += 32;
}
memcpy_tiny(dst, src, size);
return destination;
}
static void * memcpySSE2Unrolled4(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
size_t padding;
// small memory copy
if (size <= 64)
return memcpy_tiny(dst, src, size);
// align destination to 16 bytes boundary
padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
__m128i c0, c1, c2, c3;
for (; size >= 64; size -= 64)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
src += 64;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
dst += 64;
}
memcpy_tiny(dst, src, size);
return destination;
}
static void * memcpySSE2Unrolled8(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = reinterpret_cast<unsigned char *>(destination);
const unsigned char *src = reinterpret_cast<const unsigned char *>(source);
size_t padding;
// small memory copy
if (size <= 128)
return memcpy_tiny(dst, src, size);
// align destination to 16 bytes boundary
padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 128; size -= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;
}
memcpy_tiny(dst, src, size);
return destination;
}
//static __attribute__((__always_inline__, __target__("sse2")))
__attribute__((__always_inline__))
void memcpy_my_medium_sse(uint8_t * __restrict & dst, const uint8_t * __restrict & src, size_t & size)
{
/// Align destination to 16 bytes boundary.
size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
/// Aligned unrolled copy.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
while (size >= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;
size -= 128;
}
}
__attribute__((__target__("avx")))
void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * __restrict & __restrict src, size_t & __restrict size)
{
size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
if (padding > 0)
{
__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
_mm256_storeu_si256((__m256i*)dst, head);
dst += padding;
src += padding;
size -= padding;
}
__m256i c0, c1, c2, c3, c4, c5, c6, c7;
while (size >= 256)
{
c0 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 0);
c1 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 1);
c2 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 2);
c3 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 3);
c4 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 4);
c5 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 5);
c6 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 6);
c7 = _mm256_loadu_si256((reinterpret_cast<const __m256i*>(src)) + 7);
src += 256;
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6);
_mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7);
dst += 256;
size -= 256;
}
}
bool have_avx = true;
static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size)
{
uint8_t * ret = dst;
tail:
if (size <= 16)
{
if (size >= 8)
{
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
}
else if (size >= 4)
{
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
}
else if (size >= 2)
{
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
}
else if (size >= 1)
{
*dst = *src;
}
}
else if (have_avx)
{
if (size <= 32)
{
__builtin_memcpy(dst, src, 8);
__builtin_memcpy(dst + 8, src + 8, 8);
dst += 16;
src += 16;
size -= 16;
goto tail;
}
if (size <= 256)
{
__asm__(
"vmovups -0x20(%[s],%[size],1), %%ymm0\n"
"vmovups %%ymm0, -0x20(%[d],%[size],1)\n"
: [d]"+r"(dst), [s]"+r"(src)
: [size]"r"(size)
: "ymm0", "memory");
while (size > 32)
{
__asm__(
"vmovups (%[s]), %%ymm0\n"
"vmovups %%ymm0, (%[d])\n"
: [d]"+r"(dst), [s]"+r"(src)
:
: "ymm0", "memory");
dst += 32;
src += 32;
size -= 32;
}
}
else
{
size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
if (padding > 0)
{
__asm__(
"vmovups (%[s]), %%ymm0\n"
"vmovups %%ymm0, (%[d])\n"
: [d]"+r"(dst), [s]"+r"(src)
:
: "ymm0", "memory");
dst += padding;
src += padding;
size -= padding;
}
while (size >= 256)
{
__asm__(
"vmovups (%[s]), %%ymm0\n"
"vmovups 0x20(%[s]), %%ymm1\n"
"vmovups 0x40(%[s]), %%ymm2\n"
"vmovups 0x60(%[s]), %%ymm3\n"
"vmovups 0x80(%[s]), %%ymm4\n"
"vmovups 0xa0(%[s]), %%ymm5\n"
"vmovups 0xc0(%[s]), %%ymm6\n"
"vmovups 0xe0(%[s]), %%ymm7\n"
"add $0x100,%[s]\n"
"vmovaps %%ymm0, (%[d])\n"
"vmovaps %%ymm1, 0x20(%[d])\n"
"vmovaps %%ymm2, 0x40(%[d])\n"
"vmovaps %%ymm3, 0x60(%[d])\n"
"vmovaps %%ymm4, 0x80(%[d])\n"
"vmovaps %%ymm5, 0xa0(%[d])\n"
"vmovaps %%ymm6, 0xc0(%[d])\n"
"vmovaps %%ymm7, 0xe0(%[d])\n"
"add $0x100, %[d]\n"
: [d]"+r"(dst), [s]"+r"(src)
:
: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory");
size -= 256;
}
goto tail;
}
}
else
{
if (size <= 128)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
while (size > 16)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
dst += 16;
src += 16;
size -= 16;
}
}
else
{
/// Align destination to 16 bytes boundary.
size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
/// Aligned unrolled copy.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
while (size >= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;
size -= 128;
}
goto tail;
}
}
return ret;
}
template <typename F>
void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
{
memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
if (memcpy_variant == 1)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
if (memcpy_variant == 2)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
if (memcpy_variant == 3)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
if (memcpy_variant == 4)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
if (memcpy_variant == 5)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
if (memcpy_variant == 6)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
if (memcpy_variant == 7)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
if (memcpy_variant == 8)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
// if (memcpy_variant == 9)
// test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
if (memcpy_variant == 10)
test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
}
void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
{
if (generator_variant == 1)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
if (generator_variant == 2)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
if (generator_variant == 3)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
if (generator_variant == 4)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
if (generator_variant == 5)
dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
}
int main(int argc, char ** argv)
{
size_t size = 1000000000;
if (argc >= 2)
size = std::stoull(argv[1]);
size_t iterations = 10;
if (argc >= 3)
iterations = std::stoull(argv[2]);
size_t num_threads = 1;
if (argc >= 4)
num_threads = std::stoull(argv[3]);
size_t memcpy_variant = 1;
if (argc >= 5)
memcpy_variant = std::stoull(argv[4]);
size_t generator_variant = 1;
if (argc >= 6)
generator_variant = std::stoull(argv[5]);
std::unique_ptr<uint8_t[]> src(new uint8_t[size]);
std::unique_ptr<uint8_t[]> dst(new uint8_t[size]);
/// Fill src with some pattern for validation.
for (size_t i = 0; i < size; ++i)
src[i] = i;
/// Fill dst to avoid page faults.
memset(dst.get(), 0, size);
dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads);
return 0;
}
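Reading main() back: the positional arguments are size, iterations, num_threads, memcpy_variant and generator_variant, all optional, with the defaults shown above. A hypothetical invocation (the binary name is an assumption, not taken from the patch):
// ./memcpy-bench <size> <iterations> <num_threads> <memcpy_variant> <generator_variant>
// e.g. variant 10 (memcpy_my) over 1 GB, 10 iterations, 4 threads,
// chunk sizes drawn uniformly from [0, 4096):
// ./memcpy-bench 1000000000 10 4 10 3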

View File

@ -0,0 +1,138 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
// Copies memory.
//
// DEST and SRC must not overlap, unless DEST ≤ SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memcpy_jart: mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.type memcpy_jart,@function
.size memcpy_jart,.-memcpy_jart
.globl memcpy_jart
// Copies memory w/ minimal impact ABI.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @clob flags,rcx,xmm3,xmm4
// @mode long
MemCpy: mov $.Lmemcpytab.size,%ecx
cmp %rcx,%rdx
cmovb %rdx,%rcx
jmp *memcpytab(,%rcx,8)
.Lanchorpoint:
.L16r: cmp $1024,%rdx
jae .Lerms
.L16: movdqu -16(%rsi,%rdx),%xmm4
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
movdqu %xmm4,-16(%rdi,%rdx)
pxor %xmm4,%xmm4
pxor %xmm3,%xmm3
jmp .L0
.L8: push %rbx
mov (%rsi),%rcx
mov -8(%rsi,%rdx),%rbx
mov %rcx,(%rdi)
mov %rbx,-8(%rdi,%rdx)
1: pop %rbx
.L0: ret
.L4: push %rbx
mov (%rsi),%ecx
mov -4(%rsi,%rdx),%ebx
mov %ecx,(%rdi)
mov %ebx,-4(%rdi,%rdx)
jmp 1b
.L3: push %rbx
mov (%rsi),%cx
mov -2(%rsi,%rdx),%bx
mov %cx,(%rdi)
mov %bx,-2(%rdi,%rdx)
jmp 1b
.L2: mov (%rsi),%cx
mov %cx,(%rdi)
jmp .L0
.L1: mov (%rsi),%cl
mov %cl,(%rdi)
jmp .L0
.Lerms: cmp $1024*1024,%rdx
ja .Lnts
push %rdi
push %rsi
mov %rdx,%rcx
rep movsb
pop %rsi
pop %rdi
jmp .L0
.Lnts: movdqu (%rsi),%xmm3
movdqu %xmm3,(%rdi)
lea 16(%rdi),%rcx
and $-16,%rcx
sub %rdi,%rcx
add %rcx,%rdi
add %rcx,%rsi
sub %rcx,%rdx
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movntdq %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
sfence
movdqu -16(%rsi,%rdx),%xmm3
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
jmp .L0
.type MemCpy,@function
.size MemCpy,.-MemCpy
.globl MemCpy
.section .rodata
.align 8
memcpytab:
.quad .L0
.quad .L1
.quad .L2
.quad .L3
.rept 4
.quad .L4
.endr
.rept 8
.quad .L8
.endr
.rept 16
.quad .L16
.endr
.equ .Lmemcpytab.size,(.-memcpytab)/8
.quad .L16r # SSE + ERMS + NTS
.type memcpytab,@object
.previous
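The entry sequence of MemCpy (mov $.Lmemcpytab.size,%ecx / cmp / cmovb / jmp *memcpytab(,%rcx,8)) clamps the table index to min(size, 32), so sizes 0..31 select a fixed-size handler and the sentinel .L16r entry one slot past the table catches everything of 32 bytes and more. A C++ analogue of that dispatch, a sketch only, with std::memmove standing in for the fixed-size buckets and the bulk path:
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
using Handler = void (*)(uint8_t *, const uint8_t *, size_t);
static void copy_nothing(uint8_t *, const uint8_t *, size_t) {}
static void copy_small(uint8_t * d, const uint8_t * s, size_t n) { std::memmove(d, s, n); } // .L1/.L2/.L3/.L4/.L8/.L16
static void copy_bulk(uint8_t * d, const uint8_t * s, size_t n) { std::memmove(d, s, n); }  // .L16r: SSE loop / ERMS / NT stores
static const std::array<Handler, 33> memcpytab_sketch = []
{
    std::array<Handler, 33> t{};
    t[0] = copy_nothing;              // .L0: nothing to do
    for (size_t i = 1; i < 32; ++i)
        t[i] = copy_small;            // the real table picks a bucket by magnitude
    t[32] = copy_bulk;                // sentinel one past the table, reached via min()
    return t;
}();
void * memcpy_dispatch_sketch(void * dst, const void * src, size_t size)
{
    memcpytab_sketch[std::min<size_t>(size, 32)](
        static_cast<uint8_t *>(dst), static_cast<const uint8_t *>(src), size);
    return dst;
}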