#pragma once
#include <string.h>
#include <DB/Core/Defines.h>
#if defined(__x86_64__)
#include <emmintrin.h>
/** The memcpy function can be suboptimal if all of the following conditions are met:
  * 1. The size of the memory region is relatively small (approximately, under 50 bytes).
  * 2. The size of the memory region is not known at compile time.
  *
  * In that case, memcpy is suboptimal for the following reasons:
  * 1. The function is not inlined.
  * 2. Much time and many instructions are spent processing the "tails" of the data.
  *
  * There are cases when the function can be implemented in a more optimal way, with the help of some assumptions.
  * One of those assumptions is the ability to read and write some number of bytes past the end of the passed memory regions.
  * Under that assumption, there is no need for complicated code to process the tails of the data: copying is always done in big chunks.
  *
  * This case is typical, for example, when many small pieces of data are gathered into a single contiguous piece of memory in a loop
  *  - because each subsequent copy overwrites the excess data left by the previous copy.
  *
  * The assumption that the size of the memory region is small enough allows us not to unroll the loop.
  * This is slower when the size of the memory region is actually big.
  *
  * Use with caution.
  */
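
/** For illustration only - a hypothetical sketch of the pattern described above: gathering many
  * small strings into one contiguous buffer. The names 'buffer', 'total_size' and 'strings' are
  * assumptions for this example, not part of this header. Each copy may write up to 15 bytes of
  * garbage past the end of the destination, but the next copy overwrites it; only the garbage of
  * the last copy remains, which the 15 extra bytes allocated for 'buffer' absorb. Note that every
  * source must likewise tolerate a 15-byte over-read.
  *
  *     char * out = buffer;    /// 'buffer' is allocated with total_size + 15 bytes.
  *     for (const std::string & s : strings)
  *     {
  *         memcpySmallAllowReadWriteOverflow15(out, s.data(), s.size());
  *         out += s.size();
  *     }
  */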
namespace detail
{
    inline void memcpySmallAllowReadWriteOverflow15Impl(char * __restrict dst, const char * __restrict src, ssize_t n)
    {
        while (n > 0)
        {
            /// Copy 16 bytes with unaligned SSE2 load/store, regardless of how many bytes remain.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));

            dst += 16;
            src += 16;
            n -= 16;    /// 'n' is signed, so it may go negative on the last iteration:
                        /// for example, n = 20 copies 32 bytes, 12 of them past the end.
        }
    }
}
/** Works under the assumption that it is possible to read up to 15 excess bytes after the end of the 'src' region,
  * and to write any garbage into up to 15 bytes after the end of the 'dst' region.
  */
inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
{
    detail::memcpySmallAllowReadWriteOverflow15Impl(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), n);
}
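
/** A hypothetical sketch (names are assumptions, not part of this header) of a caller satisfying
  * the contract above by over-allocating both regions with 15 bytes of padding, so that the
  * over-read of 'src' and the over-write of 'dst' stay inside owned memory:
  *
  *     const size_t padding = 15;
  *     char * dst = new char[n + padding];    /// writable overflow for 'dst'
  *     char * src = new char[n + padding];    /// readable overflow for 'src'
  *     ... fill the first n bytes of src ...
  *     memcpySmallAllowReadWriteOverflow15(dst, src, n);
  */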
/** NOTE There was also a function that assumed you could read any bytes within the same memory page as 'src'.
  * That function was unused, and it also required special handling for Valgrind and ASan.
  */
#else /// Implementation for other platforms.
inline void memcpySmallAllowReadWriteOverflow15(void * __restrict dst, const void * __restrict src, size_t n)
{
    memcpy(dst, src, n);
}
#endif