diff --git a/CMakeLists.txt b/CMakeLists.txt
index 76b79a0b6c8..d310f7c298c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests"
 
 if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
     # Only for Linux, x86_64.
-    # Implies ${ENABLE_FASTMEMCPY}
     option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
 elseif(GLIBC_COMPATIBILITY)
     message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration")
@@ -241,9 +240,7 @@ else()
     message(STATUS "Disabling compiler -pipe option (have only ${AVAILABLE_PHYSICAL_MEMORY} mb of memory)")
 endif()
 
-if(NOT DISABLE_CPU_OPTIMIZE)
-    include(cmake/cpu_features.cmake)
-endif()
+include(cmake/cpu_features.cmake)
 
 option(ARCH_NATIVE "Add -march=native compiler flag")
 
@@ -536,7 +533,7 @@ macro (add_executable target)
         # explicitly acquire and interpose malloc symbols by clickhouse_malloc
         # if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on than provide memcpy symbol explicitly to neutrialize thinlto's libcall generation.
         if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO)
-            _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:clickhouse_memcpy>)
+            _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:memcpy>)
         else ()
             _add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc>)
         endif ()
diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt
index cea52b443dd..b4bf4f55466 100644
--- a/base/common/CMakeLists.txt
+++ b/base/common/CMakeLists.txt
@@ -74,7 +74,6 @@ target_link_libraries (common
         ${CITYHASH_LIBRARIES}
         boost::headers_only
         boost::system
-        FastMemcpy
         Poco::Net
         Poco::Net::SSL
         Poco::Util
diff --git a/base/common/tests/CMakeLists.txt b/base/common/tests/CMakeLists.txt
index b7082ee9900..6775d443fb6 100644
--- a/base/common/tests/CMakeLists.txt
+++ b/base/common/tests/CMakeLists.txt
@@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS})
 target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS})
-target_link_libraries (local_date_time_comparison PRIVATE common)
+target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS})
 target_link_libraries (realloc-perf PRIVATE common)
 
 add_check(local_date_time_comparison)
diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt
index 684c6162941..e785e2ab2ce 100644
--- a/base/glibc-compatibility/CMakeLists.txt
+++ b/base/glibc-compatibility/CMakeLists.txt
@@ -1,5 +1,8 @@
 if (GLIBC_COMPATIBILITY)
-    set (ENABLE_FASTMEMCPY ON)
+    add_subdirectory(memcpy)
+    if(TARGET memcpy)
+        set(MEMCPY_LIBRARY memcpy)
+    endif()
 
     enable_language(ASM)
     include(CheckIncludeFile)
@@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY)
         list(APPEND glibc_compatibility_sources musl/getentropy.c)
     endif()
 
-    if (NOT ARCH_ARM)
-        # clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951
-        add_library (clickhouse_memcpy OBJECT
-            ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c
-        )
-    endif()
-
     # Need to omit frame pointers to match the performance of glibc
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")
 
@@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY)
         target_compile_options(glibc-compatibility PRIVATE -fPIC)
     endif ()
 
-    target_link_libraries(global-libs INTERFACE glibc-compatibility)
+    target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY})
 
     install(
-        TARGETS glibc-compatibility
+        TARGETS glibc-compatibility ${MEMCPY_LIBRARY}
         EXPORT global
         ARCHIVE DESTINATION lib
     )
 
     message (STATUS "Some symbols from glibc will be replaced for compatibility")
+
 elseif (YANDEX_OFFICIAL_BUILD)
     message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.")
 endif ()
diff --git a/base/glibc-compatibility/memcpy/CMakeLists.txt b/base/glibc-compatibility/memcpy/CMakeLists.txt
new file mode 100644
index 00000000000..133995d9b96
--- /dev/null
+++ b/base/glibc-compatibility/memcpy/CMakeLists.txt
@@ -0,0 +1,8 @@
+if (ARCH_AMD64)
+    add_library(memcpy STATIC memcpy.cpp)
+
+    # We allow to include memcpy.h from user code for better inlining.
+    target_include_directories(memcpy PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+    target_compile_options(memcpy PRIVATE -fno-builtin-memcpy)
+endif ()
diff --git a/base/glibc-compatibility/memcpy/memcpy.cpp b/base/glibc-compatibility/memcpy/memcpy.cpp
new file mode 100644
index 00000000000..ec43a2c3649
--- /dev/null
+++ b/base/glibc-compatibility/memcpy/memcpy.cpp
@@ -0,0 +1,6 @@
+#include "memcpy.h"
+
+extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
+{
+    return inline_memcpy(dst, src, size);
+}
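A note on the -fno-builtin-memcpy option above: the header added below warns that an optimizer may recognize a plain copy loop and replace it with a call to memcpy. Inside the translation unit that defines memcpy, that rewrite would turn the implementation into a call to itself. The following is a minimal sketch of the hazard, with an illustrative function name that is not part of this patch:

#include <cstddef>

/// A naive byte-by-byte copy. At -O2, GCC and Clang can recognize this loop as a
/// memcpy idiom and emit a call to memcpy. If this translation unit is the one that
/// provides memcpy, that call recurses into itself. The flags named in this patch
/// (-fno-builtin-memcpy here, -ftree-loop-distribute-patterns in the header comment
/// below) exist to keep the compiler from performing that rewrite.
void * naive_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
    char * dst = static_cast<char *>(dst_);
    const char * src = static_cast<const char *>(src_);
    for (size_t i = 0; i < size; ++i)
        dst[i] = src[i];
    return dst_;
}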
diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h
new file mode 100644
index 00000000000..f9f81bcb0fe
--- /dev/null
+++ b/base/glibc-compatibility/memcpy/memcpy.h
@@ -0,0 +1,217 @@
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+
+/** Custom memcpy implementation for ClickHouse.
+  * It has the following benefits over using glibc's implementation:
+  * 1. Avoiding dependency on a specific version of glibc's symbol, like memcpy@@GLIBC_2.14, for portability.
+  * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient.
+  * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
+  * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% on average across all queries.
+  *
+  * Writing our own memcpy is extremely difficult for the following reasons:
+  * 1. The optimal variant depends on the specific CPU model.
+  * 2. The optimal variant depends on the distribution of size arguments.
+  * 3. It depends on the number of threads copying data concurrently.
+  * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other.
+  * The vast range of scenarios makes proper testing especially difficult.
+  * When writing our own memcpy there is a risk to overoptimize it
+  * on non-representative microbenchmarks while making real-world use cases actually worse.
+  *
+  * Most of the benchmarks for memcpy on the internet are wrong.
+  *
+  * Let's look at the details:
+  *
+  * For small sizes, the order of branches in the code is important.
+  * There are variants with a specific order of branches (like here or in glibc)
+  * or with a jump table (in asm code see the example from Cosmopolitan libc:
+  * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61)
+  * or with a Duff device in C (see https://github.com/skywind3000/FastMemcpy/).
+  *
+  * It's also important how to copy uneven sizes.
+  * Almost every implementation, including this one, is using two overlapping movs.
+  *
+  * It is important to disable -ftree-loop-distribute-patterns when compiling the memcpy implementation,
+  * otherwise the compiler can replace the internal loops with a call to memcpy that will lead to infinite recursion.
+  *
+  * For larger sizes it's important to choose the instructions used:
+  * - SSE or AVX or AVX-512;
+  * - rep movsb;
+  * Performance will depend on the size threshold, on the CPU model, and on the "erms" flag
+  * ("Enhanced Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes)
+  * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
+  *
+  * Using AVX-512 can be bad due to throttling.
+  * Using AVX can be bad if most code is using SSE due to switching penalty
+  * (it also depends on the usage of the "vzeroupper" instruction).
+  * But in some cases AVX gives a win.
+  *
+  * It also depends on how many times the loop will be unrolled.
+  * We are unrolling the loop 8 times (by the number of available registers), but it is not always the best.
+  *
+  * It also depends on the usage of aligned or unaligned loads/stores.
+  * We are using unaligned loads and aligned stores.
+  *
+  * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD.
+  * Setting up the correct offset for prefetching is non-obvious.
+  *
+  * Non-temporal (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache).
+  * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower,
+  * because L3 cache is shared (and L2 cache is partially shared).
+  *
+  * A very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios,
+  * so we don't pay attention to using non-temporal stores.
+  *
+  * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial,
+  * even compared to non-temporal aligned unrolled stores, even with the widest registers.
+  *
+  * memcpy can be written in asm, C or C++. The latter can also use inline asm.
+  * The asm implementation can be better to make sure that the compiler won't make the code worse,
+  * to ensure the order of branches, the code layout, the usage of all required registers.
+  * But if it is located in a separate translation unit, inlining will not be possible
+  * (inline asm can be used to overcome this limitation).
+  * Sometimes C or C++ code can be further optimized by the compiler.
+  * For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
+  *
+  * Please note that the compiler can replace plain code with memcpy and vice versa:
+  * - memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy;
+  *   it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy.
+  *   This is often used to implement unaligned load/store without undefined behaviour in C++.
+  * - a loop copying bytes can be recognized and replaced by a call to memcpy;
+  *   it is controlled by -ftree-loop-distribute-patterns.
+  * - also note that a loop copying bytes can be unrolled, peeled and vectorized, which will give you
+  *   inline code somewhat similar to a decent implementation of memcpy.
+  *
+  * This description is up to date as of Mar 2021.
+  *
+  * How to test the memcpy implementation for performance:
+  * 1. Test on real production workload.
+  * 2. For synthetic tests, see utils/memcpy-bench, but make sure you do your best to cover the wide range of scenarios.
+  *
+  * TODO: Add self-tuning memcpy with Bayesian bandits algorithm for large sizes.
+  * See https://habr.com/en/company/yandex/blog/457612/
+  */
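To make the "two overlapping movs" idea from the comment above concrete: any length between 8 and 16 bytes can be covered by one 8-byte copy anchored at the end and one 8-byte copy anchored at the start; when the size is less than 16 the two ranges overlap and the overlapping bytes are simply written twice with identical values. A small standalone sketch (the function name is illustrative, not from the patch):

#include <cstddef>

/// Copy `size` bytes, where 8 <= size <= 16, using two possibly overlapping 8-byte moves,
/// in the same order as the 8..16 branch of the implementation below: tail first, then head.
/// The example in the code below does the same with 4-byte moves for 5 bytes [0, 1, 2, 3, 4]:
/// tail [1, 2, 3, 4] first, then head [0, 1, 2, 3].
static void copy_overlapping_8_16(char * __restrict dst, const char * __restrict src, size_t size)
{
    __builtin_memcpy(dst + size - 8, src + size - 8, 8);  /// tail: the last 8 bytes
    __builtin_memcpy(dst, src, 8);                        /// head: the first 8 bytes, overlaps the tail when size < 16
}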
+
+
+static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
+{
+    /// We will use pointer arithmetic, so char pointer will be used.
+    /// Note that __restrict makes sense (otherwise compiler will reload data from memory
+    /// instead of using the value of registers due to possible aliasing).
+    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
+    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);
+
+    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
+    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
+    /// for inlining and removing this single instruction.
+    void * ret = dst;
+
+tail:
+    /// Small sizes and tails after the loop for large sizes.
+    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
+    /// This order of branches is from the disassembly of glibc's code.
+    /// We copy chunks of possibly uneven size with two overlapping movs.
+    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
+    if (size <= 16)
+    {
+        if (size >= 8)
+        {
+            /// Chunks of 8..16 bytes.
+            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
+            __builtin_memcpy(dst, src, 8);
+        }
+        else if (size >= 4)
+        {
+            /// Chunks of 4..7 bytes.
+            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
+            __builtin_memcpy(dst, src, 4);
+        }
+        else if (size >= 2)
+        {
+            /// Chunks of 2..3 bytes.
+            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
+            __builtin_memcpy(dst, src, 2);
+        }
+        else if (size >= 1)
+        {
+            /// A single byte.
+            *dst = *src;
+        }
+        /// No bytes remaining.
+    }
+    else
+    {
+        /// Medium and large sizes.
+        if (size <= 128)
+        {
+            /// Medium size, not enough for full loop unrolling.
+
+            /// We will copy the last 16 bytes.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));
+
+            /// Then we will copy every 16 bytes from the beginning in a loop.
+            /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes.
+            /// This is Ok, similar to the code for small sizes above.
+            while (size > 16)
+            {
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+                dst += 16;
+                src += 16;
+                size -= 16;
+            }
+        }
+        else
+        {
+            /// Large size with fully unrolled loop.
+
+            /// Align destination to 16 bytes boundary.
+            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
+
+            /// If not aligned - we will copy the first 16 bytes with unaligned stores.
+            if (padding > 0)
+            {
+                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), head);
+                dst += padding;
+                src += padding;
+                size -= padding;
+            }
+
+            /// Aligned unrolled copy. We will use all available SSE registers.
+            /// It's not possible to have both src and dst aligned.
+            /// So, we will use aligned stores and unaligned loads.
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            /// The last remaining 0..127 bytes will be processed as usual.
+            goto tail;
+        }
+    }
+
+    return ret;
+}
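Benefit 3 from the header comment, and the note next to `ret` in the implementation, describe the intended way to use this header directly: a caller that copies small chunks of non-constant size can include memcpy.h and call inline_memcpy, so the copy is inlined instead of going through a function call. A hypothetical caller might look like this (the function and variable names are illustrative, not from the patch):

#include <string>

#include "memcpy.h"  /// found via the memcpy target's PUBLIC include directory set up above

/// Append a short, variable-length field to an output buffer and return the new end.
/// Calling inline_memcpy lets the copy be inlined into the caller, avoiding the call
/// and the instruction that preserves memcpy's return value.
inline char * append_field(char * __restrict out, const std::string & field)
{
    inline_memcpy(out, field.data(), field.size());
    return out + field.size();
}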
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index bf4bf5eb472..6a56b4cc733 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -38,7 +38,6 @@ add_subdirectory (boost-cmake)
 add_subdirectory (cctz-cmake)
 add_subdirectory (consistent-hashing)
 add_subdirectory (dragonbox-cmake)
-add_subdirectory (FastMemcpy)
 add_subdirectory (hyperscan-cmake)
 add_subdirectory (jemalloc-cmake)
 add_subdirectory (libcpuid-cmake)
diff --git a/contrib/FastMemcpy/CMakeLists.txt b/contrib/FastMemcpy/CMakeLists.txt
deleted file mode 100644
index 8efe6d45dff..00000000000
--- a/contrib/FastMemcpy/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-option (ENABLE_FASTMEMCPY "Enable FastMemcpy library (only internal)" ${ENABLE_LIBRARIES})
-
-if (NOT OS_LINUX OR ARCH_AARCH64)
-    set (ENABLE_FASTMEMCPY OFF)
-endif ()
-
-if (ENABLE_FASTMEMCPY)
-    set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy)
-
-    set (SRCS
-        ${LIBRARY_DIR}/FastMemcpy.c
-
-        memcpy_wrapper.c
-    )
-
-    add_library (FastMemcpy ${SRCS})
-    target_include_directories (FastMemcpy PUBLIC ${LIBRARY_DIR})
-
-    target_compile_definitions(FastMemcpy PUBLIC USE_FASTMEMCPY=1)
-
-    message (STATUS "Using FastMemcpy")
-else ()
-    add_library (FastMemcpy INTERFACE)
-
-    target_compile_definitions(FastMemcpy INTERFACE USE_FASTMEMCPY=0)
-
-    message (STATUS "Not using FastMemcpy")
-endif ()
diff --git a/contrib/FastMemcpy/FastMemcpy.c b/contrib/FastMemcpy/FastMemcpy.c
deleted file mode 100644
index 5021bcc7d16..00000000000
--- a/contrib/FastMemcpy/FastMemcpy.c
+++ /dev/null
@@ -1,220 +0,0 @@
-//=====================================================================
-//
-// FastMemcpy.c - skywind3000@163.com, 2015
-//
-// feature:
-//     50% speed up in avg.
vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy.h" - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); - bench(4096, 0x80000); - bench(8192, 0x40000); - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 
8000000); - - return 0; -} - - - - -/* -benchmark(size=32 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms -result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms - -benchmark(size=64 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms -result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms -result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms - -benchmark(size=512 bytes, times=8388608): -result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms -result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms -result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms - -benchmark(size=1024 bytes, times=4194304): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms -result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms -result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms - -benchmark(size=4096 bytes, times=524288): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms - -benchmark(size=8192 bytes, times=262144): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms - -benchmark(size=1048576 bytes, times=2048): -result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms -result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms -result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms -result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms - -benchmark(size=4194304 bytes, times=512): -result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms -result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms -result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms -result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms - -benchmark(size=8388608 bytes, times=256): -result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms -result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms -result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms -result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms - -benchmark random access: -memcpy_fast=515ms memcpy=1014ms - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy.h b/contrib/FastMemcpy/FastMemcpy.h deleted file mode 100644 index 5dcbfcf1656..00000000000 --- a/contrib/FastMemcpy/FastMemcpy.h +++ /dev/null @@ -1,694 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - -typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; -typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; -typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -/// Attribute is used to avoid an error with undefined behaviour sanitizer -/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer -/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. 
-__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) 
= *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - 
memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = 
*((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - 
if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.c b/contrib/FastMemcpy/FastMemcpy_Avx.c deleted file mode 100644 index 6538c6b2126..00000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.c +++ /dev/null @@ -1,171 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy_Avx.h" - - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ -#if 1 - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); -#endif - bench(4096, 0x80000); - bench(8192, 0x40000); -#if 1 - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); -#endif - 
bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.h b/contrib/FastMemcpy/FastMemcpy_Avx.h deleted file mode 100644 index 8ba064b0350..00000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.h +++ /dev/null @@ -1,492 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_avx_16(void *dst, const void *src) { -#if 1 - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -#else - *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); - *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); -#endif -} - -static INLINE void memcpy_avx_32(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); -} - -static INLINE void memcpy_avx_64(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); -} - -static INLINE void memcpy_avx_128(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); -} - -static INLINE void memcpy_avx_256(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - __m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - __m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - __m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - __m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); - _mm256_storeu_si256(((__m256i*)dst) + 
4, m4); - _mm256_storeu_si256(((__m256i*)dst) + 5, m5); - _mm256_storeu_si256(((__m256i*)dst) + 6, m6); - _mm256_storeu_si256(((__m256i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 128: memcpy_avx_128(dd - 128, ss - 128); - case 0: break; - case 129: memcpy_avx_128(dd - 129, ss - 129); - case 1: dd[-1] = ss[-1]; break; - case 130: memcpy_avx_128(dd - 130, ss - 130); - case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 131: memcpy_avx_128(dd - 131, ss - 131); - case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; - case 132: memcpy_avx_128(dd - 132, ss - 132); - case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 133: memcpy_avx_128(dd - 133, ss - 133); - case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; - case 134: memcpy_avx_128(dd - 134, ss - 134); - case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 135: memcpy_avx_128(dd - 135, ss - 135); - case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 136: memcpy_avx_128(dd - 136, ss - 136); - case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 137: memcpy_avx_128(dd - 137, ss - 137); - case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; - case 138: memcpy_avx_128(dd - 138, ss - 138); - case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 139: memcpy_avx_128(dd - 139, ss - 139); - case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 140: memcpy_avx_128(dd - 140, ss - 140); - case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 141: memcpy_avx_128(dd - 141, ss - 141); - case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 142: memcpy_avx_128(dd - 142, ss - 142); - case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 143: memcpy_avx_128(dd - 143, ss - 143); - case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 144: memcpy_avx_128(dd - 144, ss - 144); - case 16: memcpy_avx_16(dd - 16, ss - 16); break; - case 145: memcpy_avx_128(dd - 145, ss - 145); - case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; - case 146: memcpy_avx_128(dd - 146, ss - 146); - case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 147: memcpy_avx_128(dd - 147, ss - 147); - case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 148: memcpy_avx_128(dd - 148, ss - 148); - case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 149: memcpy_avx_128(dd - 149, ss - 149); - case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = 
*((uint64_t*)(ss - 8)); break; - case 150: memcpy_avx_128(dd - 150, ss - 150); - case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 151: memcpy_avx_128(dd - 151, ss - 151); - case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 152: memcpy_avx_128(dd - 152, ss - 152); - case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 153: memcpy_avx_128(dd - 153, ss - 153); - case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; - case 154: memcpy_avx_128(dd - 154, ss - 154); - case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; - case 155: memcpy_avx_128(dd - 155, ss - 155); - case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; - case 156: memcpy_avx_128(dd - 156, ss - 156); - case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; - case 157: memcpy_avx_128(dd - 157, ss - 157); - case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; - case 158: memcpy_avx_128(dd - 158, ss - 158); - case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; - case 159: memcpy_avx_128(dd - 159, ss - 159); - case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; - case 160: memcpy_avx_128(dd - 160, ss - 160); - case 32: memcpy_avx_32(dd - 32, ss - 32); break; - case 161: memcpy_avx_128(dd - 161, ss - 161); - case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; - case 162: memcpy_avx_128(dd - 162, ss - 162); - case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 163: memcpy_avx_128(dd - 163, ss - 163); - case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 164: memcpy_avx_128(dd - 164, ss - 164); - case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 165: memcpy_avx_128(dd - 165, ss - 165); - case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 166: memcpy_avx_128(dd - 166, ss - 166); - case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 167: memcpy_avx_128(dd - 167, ss - 167); - case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 168: memcpy_avx_128(dd - 168, ss - 168); - case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 169: memcpy_avx_128(dd - 169, ss - 169); - case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; - case 170: memcpy_avx_128(dd - 170, ss - 170); - case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; - case 171: memcpy_avx_128(dd - 171, ss - 171); - case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; - case 172: memcpy_avx_128(dd - 172, ss - 172); - case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; - case 173: memcpy_avx_128(dd - 173, ss - 173); - case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; - case 174: memcpy_avx_128(dd - 174, ss - 174); - case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; - case 175: memcpy_avx_128(dd - 175, ss - 175); - case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 
16, ss - 16); break; - case 176: memcpy_avx_128(dd - 176, ss - 176); - case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; - case 177: memcpy_avx_128(dd - 177, ss - 177); - case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; - case 178: memcpy_avx_128(dd - 178, ss - 178); - case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; - case 179: memcpy_avx_128(dd - 179, ss - 179); - case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; - case 180: memcpy_avx_128(dd - 180, ss - 180); - case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; - case 181: memcpy_avx_128(dd - 181, ss - 181); - case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; - case 182: memcpy_avx_128(dd - 182, ss - 182); - case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; - case 183: memcpy_avx_128(dd - 183, ss - 183); - case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; - case 184: memcpy_avx_128(dd - 184, ss - 184); - case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; - case 185: memcpy_avx_128(dd - 185, ss - 185); - case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; - case 186: memcpy_avx_128(dd - 186, ss - 186); - case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; - case 187: memcpy_avx_128(dd - 187, ss - 187); - case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; - case 188: memcpy_avx_128(dd - 188, ss - 188); - case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; - case 189: memcpy_avx_128(dd - 189, ss - 189); - case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; - case 190: memcpy_avx_128(dd - 190, ss - 190); - case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; - case 191: memcpy_avx_128(dd - 191, ss - 191); - case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; - case 192: memcpy_avx_128(dd - 192, ss - 192); - case 64: memcpy_avx_64(dd - 64, ss - 64); break; - case 193: memcpy_avx_128(dd - 193, ss - 193); - case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; - case 194: memcpy_avx_128(dd - 194, ss - 194); - case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 195: memcpy_avx_128(dd - 195, ss - 195); - case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 196: memcpy_avx_128(dd - 196, ss - 196); - case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 197: memcpy_avx_128(dd - 197, ss - 197); - case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 198: memcpy_avx_128(dd - 198, ss - 198); - case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 199: memcpy_avx_128(dd - 199, ss - 199); - case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 200: memcpy_avx_128(dd - 200, ss - 200); - case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 201: memcpy_avx_128(dd - 201, ss - 201); - case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; - case 202: memcpy_avx_128(dd - 202, 
ss - 202); - case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; - case 203: memcpy_avx_128(dd - 203, ss - 203); - case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; - case 204: memcpy_avx_128(dd - 204, ss - 204); - case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; - case 205: memcpy_avx_128(dd - 205, ss - 205); - case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; - case 206: memcpy_avx_128(dd - 206, ss - 206); - case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; - case 207: memcpy_avx_128(dd - 207, ss - 207); - case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; - case 208: memcpy_avx_128(dd - 208, ss - 208); - case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; - case 209: memcpy_avx_128(dd - 209, ss - 209); - case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; - case 210: memcpy_avx_128(dd - 210, ss - 210); - case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; - case 211: memcpy_avx_128(dd - 211, ss - 211); - case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; - case 212: memcpy_avx_128(dd - 212, ss - 212); - case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; - case 213: memcpy_avx_128(dd - 213, ss - 213); - case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; - case 214: memcpy_avx_128(dd - 214, ss - 214); - case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; - case 215: memcpy_avx_128(dd - 215, ss - 215); - case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; - case 216: memcpy_avx_128(dd - 216, ss - 216); - case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; - case 217: memcpy_avx_128(dd - 217, ss - 217); - case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; - case 218: memcpy_avx_128(dd - 218, ss - 218); - case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; - case 219: memcpy_avx_128(dd - 219, ss - 219); - case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; - case 220: memcpy_avx_128(dd - 220, ss - 220); - case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; - case 221: memcpy_avx_128(dd - 221, ss - 221); - case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; - case 222: memcpy_avx_128(dd - 222, ss - 222); - case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; - case 223: memcpy_avx_128(dd - 223, ss - 223); - case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; - case 224: memcpy_avx_128(dd - 224, ss - 224); - case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; - case 225: memcpy_avx_128(dd - 225, ss - 225); - case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; - case 226: memcpy_avx_128(dd - 226, ss - 226); - case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; - case 227: memcpy_avx_128(dd - 227, ss - 227); - case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; - case 228: memcpy_avx_128(dd - 228, ss - 228); - case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; - case 229: 
memcpy_avx_128(dd - 229, ss - 229); - case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; - case 230: memcpy_avx_128(dd - 230, ss - 230); - case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; - case 231: memcpy_avx_128(dd - 231, ss - 231); - case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; - case 232: memcpy_avx_128(dd - 232, ss - 232); - case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; - case 233: memcpy_avx_128(dd - 233, ss - 233); - case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; - case 234: memcpy_avx_128(dd - 234, ss - 234); - case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; - case 235: memcpy_avx_128(dd - 235, ss - 235); - case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; - case 236: memcpy_avx_128(dd - 236, ss - 236); - case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; - case 237: memcpy_avx_128(dd - 237, ss - 237); - case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; - case 238: memcpy_avx_128(dd - 238, ss - 238); - case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; - case 239: memcpy_avx_128(dd - 239, ss - 239); - case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; - case 240: memcpy_avx_128(dd - 240, ss - 240); - case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; - case 241: memcpy_avx_128(dd - 241, ss - 241); - case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; - case 242: memcpy_avx_128(dd - 242, ss - 242); - case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; - case 243: memcpy_avx_128(dd - 243, ss - 243); - case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; - case 244: memcpy_avx_128(dd - 244, ss - 244); - case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; - case 245: memcpy_avx_128(dd - 245, ss - 245); - case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; - case 246: memcpy_avx_128(dd - 246, ss - 246); - case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; - case 247: memcpy_avx_128(dd - 247, ss - 247); - case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; - case 248: memcpy_avx_128(dd - 248, ss - 248); - case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; - case 249: memcpy_avx_128(dd - 249, ss - 249); - case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; - case 250: memcpy_avx_128(dd - 250, ss - 250); - case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; - case 251: memcpy_avx_128(dd - 251, ss - 251); - case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; - case 252: memcpy_avx_128(dd - 252, ss - 252); - case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; - case 253: memcpy_avx_128(dd - 253, ss - 253); - case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; - case 254: memcpy_avx_128(dd - 254, ss - 254); - case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; - case 255: memcpy_avx_128(dd - 255, ss - 255); - case 
127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; - case 256: memcpy_avx_256(dd - 256, ss - 256); break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L3-cache size - size_t padding; - - // small memory copy - if (size <= 256) { - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - return destination; - } - - // align destination to 16 bytes boundary - padding = (32 - (((size_t)dst) & 31)) & 31; - -#if 0 - if (padding > 0) { - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } -#else - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; -#endif - - // medium size copy - if (size <= cachesize) { - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_storeu_si256((((__m256i*)dst) + 0), c0); - _mm256_storeu_si256((((__m256i*)dst) + 1), c1); - _mm256_storeu_si256((((__m256i*)dst) + 2), c2); - _mm256_storeu_si256((((__m256i*)dst) + 3), c3); - _mm256_storeu_si256((((__m256i*)dst) + 4), c4); - _mm256_storeu_si256((((__m256i*)dst) + 5), c5); - _mm256_storeu_si256((((__m256i*)dst) + 6), c6); - _mm256_storeu_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // big memory copy - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 31) == 0) { // source aligned - for (; size >= 256; size -= 256) { - c0 = _mm256_load_si256(((const __m256i*)src) + 0); - c1 = _mm256_load_si256(((const __m256i*)src) + 1); - c2 = _mm256_load_si256(((const __m256i*)src) + 2); - c3 = _mm256_load_si256(((const __m256i*)src) + 3); - c4 = _mm256_load_si256(((const __m256i*)src) + 4); - c5 = _mm256_load_si256(((const __m256i*)src) + 5); - c6 = _mm256_load_si256(((const __m256i*)src) + 6); - c7 = _mm256_load_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // source unaligned - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = 
_mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - - return destination; -} - - -#endif - - - diff --git a/contrib/FastMemcpy/LICENSE b/contrib/FastMemcpy/LICENSE deleted file mode 100644 index c449da6aa8a..00000000000 --- a/contrib/FastMemcpy/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Linwei - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/contrib/FastMemcpy/README.md b/contrib/FastMemcpy/README.md deleted file mode 100644 index e253f6bf5dd..00000000000 --- a/contrib/FastMemcpy/README.md +++ /dev/null @@ -1,20 +0,0 @@ -Internal implementation of `memcpy` function. - -It has the following advantages over `libc`-supplied implementation: -- it is linked statically, so the function is called directly, not through a `PLT` (procedure lookup table of shared library); -- it is linked statically, so the function can have position-dependent code; -- your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library; -- you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions; -- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents). - -Currently it uses the implementation from **Linwei** (skywind3000@163.com). -Look at https://www.zhihu.com/question/35172305 for discussion. 
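The `memcpy_tiny` jump table removed above handles every size from 0 to 256 by copying fixed-width chunks measured back from the end of the buffers (`dd - N`, `ss - N`), so an uneven size becomes two fixed-width copies whose middles overlap. Below is a minimal sketch of that trick for the 8..16 byte range, written against plain `std::memcpy` rather than the removed AVX helpers; the function name and the size range are illustrative only, not part of the patch.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy 8..16 bytes with two 8-byte loads/stores that may overlap in the
// middle — the same idea the removed jump table uses (e.g. cases 9..15 above
// copy [0, 8) and [size - 8, size)).
static inline void copy_8_to_16(void * dst, const void * src, size_t size)
{
    // Assumes 8 <= size <= 16 and non-overlapping dst/src.
    std::uint64_t head;
    std::uint64_t tail;
    std::memcpy(&head, src, 8);                                        // first 8 bytes
    std::memcpy(&tail, static_cast<const char *>(src) + size - 8, 8);  // last 8 bytes (may overlap head)
    std::memcpy(dst, &head, 8);
    std::memcpy(static_cast<char *>(dst) + size - 8, &tail, 8);
}
```

Because each `std::memcpy` has a fixed size, the compiler lowers it to a single unaligned move, and the overlapping store simply rewrites a few bytes with the same values, avoiding any per-byte loop or branch on the remainder.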
- -Drawbacks: -- only use SSE 2, doesn't use wider (AVX, AVX 512) vector registers when available; -- no CPU dispatching; doesn't take into account actual cache size. - -Also worth to look at: -- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S -- implementation from Agner Fog: http://www.agner.org/optimize/ -- glibc source code. diff --git a/contrib/FastMemcpy/memcpy_wrapper.c b/contrib/FastMemcpy/memcpy_wrapper.c deleted file mode 100644 index 1f57345980a..00000000000 --- a/contrib/FastMemcpy/memcpy_wrapper.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 32012efb064..6bcdc3df5cd 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-sqlancer-test . FROM ubuntu:20.04 -RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven --yes --no-install-recommends +RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip RUN mkdir /sqlancer && \ diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md index d53a47a36a3..cbd409ccab6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md @@ -14,26 +14,19 @@ avg(x) **Arguments** -- `x` — Values. - -`x` must be -[Integer](../../../sql-reference/data-types/int-uint.md), -[floating-point](../../../sql-reference/data-types/float.md), or -[Decimal](../../../sql-reference/data-types/decimal.md). +- `x` — input values, must be [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md). **Returned value** -- `NaN` if the supplied parameter is empty. -- Mean otherwise. - -**Return type** is always [Float64](../../../sql-reference/data-types/float.md). +- The arithmetic mean, always as [Float64](../../../sql-reference/data-types/float.md). +- `NaN` if the input parameter `x` is empty. **Example** Query: ``` sql -SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5) +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); ``` Result: @@ -46,11 +39,20 @@ Result: **Example** +Create a temp table: + Query: ``` sql CREATE table test (t UInt8) ENGINE = Memory; -SELECT avg(t) FROM test +``` + +Get the arithmetic mean: + +Query: + +``` +SELECT avg(t) FROM test; ``` Result: @@ -60,3 +62,5 @@ Result: │ nan │ └────────┘ ``` + +[Original article](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/avg/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/argmax.md b/docs/ru/sql-reference/aggregate-functions/reference/argmax.md index f44e65831a9..dd2df23e1cd 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/argmax.md @@ -27,13 +27,13 @@ argMax(tuple(arg, val)) **Возвращаемое значение** -- Значение `arg`, соответствующее максимальному значению `val`. 
+- значение `arg`, соответствующее максимальному значению `val`. Тип: соответствует типу `arg`. Если передан кортеж: -- Кортеж `(arg, val)` c максимальным значением `val` и соответствующим ему `arg`. +- кортеж `(arg, val)` c максимальным значением `val` и соответствующим ему `arg`. Тип: [Tuple](../../../sql-reference/data-types/tuple.md). @@ -52,15 +52,15 @@ argMax(tuple(arg, val)) Запрос: ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary; +SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary; ``` Результат: ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ -│ director │ ('director',5000) │ -└──────────────────────┴─────────────────────────────┘ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ ('director',5000) │ +└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘ ``` [Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/argmax/) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/avg.md b/docs/ru/sql-reference/aggregate-functions/reference/avg.md index b0bee64ec66..c032199aa32 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/avg.md @@ -4,8 +4,61 @@ toc_priority: 5 # avg {#agg_function-avg} -Вычисляет среднее. -Работает только для чисел. -Результат всегда Float64. +Вычисляет среднее арифметическое. -[Оригинальная статья](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/avg/) +**Синтаксис** + +``` sql +avg(x) +``` + +**Аргументы** + +- `x` — входное значение типа [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) или [Decimal](../../../sql-reference/data-types/decimal.md). + +**Возвращаемое значение** + +- среднее арифметическое, всегда типа [Float64](../../../sql-reference/data-types/float.md). +- `NaN`, если входное значение `x` — пустое. + +**Пример** + +Запрос: + +``` sql +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); +``` + +Результат: + +``` text +┌─avg(x)─┐ +│ 2.5 │ +└────────┘ +``` + +**Пример** + +Создайте временную таблицу: + +Запрос: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +``` + +Выполните запрос: + +``` sql +SELECT avg(t) FROM test; +``` + +Результат: + +``` text +┌─avg(x)─┐ +│ nan │ +└────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/aggregate-functions/reference/avg/) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index f44cfd938d6..40ce23fffb2 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -545,6 +545,7 @@ M(576, KERBEROS_ERROR) \ M(577, INVALID_SHARD_ID) \ M(578, INVALID_FORMAT_INSERT_QUERY_WITH_DATA) \ + M(579, INCORRECT_PART_TYPE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 26be7fe82ba..163a6503d2e 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -568,7 +568,7 @@ public: /// arr1 takes ownership of the heap memory of arr2. arr1.c_start = arr2.c_start; - arr1.c_end_of_storage = arr1.c_start + heap_allocated - arr1.pad_right; + arr1.c_end_of_storage = arr1.c_start + heap_allocated - arr2.pad_right - arr2.pad_left; arr1.c_end = arr1.c_start + this->byte_size(heap_size); /// Allocate stack space for arr2. 
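The PODArray fix above subtracts both `pad_right` and `pad_left` when recomputing `c_end_of_storage`, because the allocated byte count covers padding on both sides of the usable region while `c_start` already points past the left padding; subtracting only `pad_right`, as before, overstates the usable capacity by `pad_left` bytes. A stripped-down sketch of that invariant follows — a hypothetical `PaddedBuffer`, not the real PODArray, with made-up padding values.

```cpp
#include <cstddef>

// Minimal illustration of the capacity invariant behind the fix above.
struct PaddedBuffer
{
    static constexpr size_t pad_left = 16;   // hypothetical padding sizes
    static constexpr size_t pad_right = 15;

    char * c_start = nullptr;           // first usable byte (after the left padding)
    char * c_end = nullptr;             // one past the last used byte
    char * c_end_of_storage = nullptr;  // one past the last usable byte

    // allocated_bytes is the full allocation, including BOTH paddings.
    void adopt(char * allocation, size_t allocated_bytes, size_t used_bytes)
    {
        c_start = allocation + pad_left;
        // Equivalent to c_start + allocated_bytes - pad_left - pad_right,
        // mirroring the corrected expression in the hunk above.
        c_end_of_storage = allocation + allocated_bytes - pad_right;
        c_end = c_start + used_bytes;
    }
};
```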
@@ -585,7 +585,7 @@ public: dest.dealloc(); dest.alloc(src.allocated_bytes(), std::forward(allocator_params)...); memcpy(dest.c_start, src.c_start, this->byte_size(src.size())); - dest.c_end = dest.c_start + (src.c_end - src.c_start); + dest.c_end = dest.c_start + this->byte_size(src.size()); src.c_start = Base::null; src.c_end = Base::null; @@ -639,8 +639,8 @@ public: size_t rhs_size = rhs.size(); size_t rhs_allocated = rhs.allocated_bytes(); - this->c_end_of_storage = this->c_start + rhs_allocated - Base::pad_right; - rhs.c_end_of_storage = rhs.c_start + lhs_allocated - Base::pad_right; + this->c_end_of_storage = this->c_start + rhs_allocated - Base::pad_right - Base::pad_left; + rhs.c_end_of_storage = rhs.c_start + lhs_allocated - Base::pad_right - Base::pad_left; this->c_end = this->c_start + this->byte_size(rhs_size); rhs.c_end = rhs.c_start + this->byte_size(lhs_size); @@ -693,34 +693,34 @@ public: } - bool operator== (const PODArray & other) const + bool operator== (const PODArray & rhs) const { - if (this->size() != other.size()) + if (this->size() != rhs.size()) return false; - const_iterator this_it = begin(); - const_iterator that_it = other.begin(); + const_iterator lhs_it = begin(); + const_iterator rhs_it = rhs.begin(); - while (this_it != end()) + while (lhs_it != end()) { - if (*this_it != *that_it) + if (*lhs_it != *rhs_it) return false; - ++this_it; - ++that_it; + ++lhs_it; + ++rhs_it; } return true; } - bool operator!= (const PODArray & other) const + bool operator!= (const PODArray & rhs) const { - return !operator==(other); + return !operator==(rhs); } }; -template -void swap(PODArray & lhs, PODArray & rhs) +template +void swap(PODArray & lhs, PODArray & rhs) { lhs.swap(rhs); } diff --git a/src/Common/SharedLibrary.cpp b/src/Common/SharedLibrary.cpp index 689179be7d8..37da308d5af 100644 --- a/src/Common/SharedLibrary.cpp +++ b/src/Common/SharedLibrary.cpp @@ -13,11 +13,11 @@ namespace ErrorCodes extern const int CANNOT_DLSYM; } -SharedLibrary::SharedLibrary(const std::string & path, int flags) +SharedLibrary::SharedLibrary(std::string_view path, int flags) { - handle = dlopen(path.c_str(), flags); + handle = dlopen(path.data(), flags); if (!handle) - throw Exception(std::string("Cannot dlopen: ") + dlerror(), ErrorCodes::CANNOT_DLOPEN); + throw Exception(ErrorCodes::CANNOT_DLOPEN, "Cannot dlopen: ({})", dlerror()); updatePHDRCache(); @@ -31,17 +31,18 @@ SharedLibrary::~SharedLibrary() std::terminate(); } -void * SharedLibrary::getImpl(const std::string & name, bool no_throw) +void * SharedLibrary::getImpl(std::string_view name, bool no_throw) { dlerror(); - auto * res = dlsym(handle, name.c_str()); + auto * res = dlsym(handle, name.data()); if (char * error = dlerror()) { if (no_throw) return nullptr; - throw Exception(std::string("Cannot dlsym: ") + error, ErrorCodes::CANNOT_DLSYM); + + throw Exception(ErrorCodes::CANNOT_DLSYM, "Cannot dlsym: ({})", error); } return res; diff --git a/src/Common/SharedLibrary.h b/src/Common/SharedLibrary.h index 9d2b9bc7843..866e60fbd33 100644 --- a/src/Common/SharedLibrary.h +++ b/src/Common/SharedLibrary.h @@ -14,23 +14,24 @@ namespace DB class SharedLibrary : private boost::noncopyable { public: - explicit SharedLibrary(const std::string & path, int flags = RTLD_LAZY); + explicit SharedLibrary(std::string_view path, int flags = RTLD_LAZY); ~SharedLibrary(); template - Func get(const std::string & name) + Func get(std::string_view name) { return reinterpret_cast(getImpl(name)); } + template - Func tryGet(const std::string & name) + 
Func tryGet(std::string_view name) { return reinterpret_cast(getImpl(name, true)); } private: - void * getImpl(const std::string & name, bool no_throw = false); + void * getImpl(std::string_view name, bool no_throw = false); void * handle = nullptr; }; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d3e77695820..3fdedf385a1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -215,7 +215,7 @@ class IColumn; \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ - M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite.", 0) \ + M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. Zero means async mode.", 0) \ M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \ M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ @@ -434,7 +434,9 @@ class IColumn; M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ - M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \ + M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ + M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ + M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ @@ -446,6 +448,7 @@ class IColumn; M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ M(UInt64, query_plan_max_optimizations_to_apply, 10000, "Limit the total number of optimizations applied to query plan. If zero, ignored. If limit reached, throw exception", 0) \ + M(Bool, database_replicated_ddl_output, true, "Obsolete setting, does nothing. Will be removed after 2021-09-08", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below. 
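The new `distributed_ddl_output_mode` setting added above is an enum setting with `THROW` as its default; the SettingsEnums changes that follow map the strings `none`, `throw`, `null_status_on_timeout` and `never_throw` onto `DistributedDDLOutputMode`. As a standalone illustration of that name-to-value mapping — the real work is done by the `IMPLEMENT_SETTING_ENUM`/`DECLARE_SETTING_ENUM` machinery shown below, not by hand-written code like this:

```cpp
#include <map>
#include <stdexcept>
#include <string>

enum class DistributedDDLOutputMode { NONE, THROW, NULL_STATUS_ON_TIMEOUT, NEVER_THROW };

// Sketch of the string -> enum mapping the patch declares for the setting.
DistributedDDLOutputMode parseOutputMode(const std::string & value)
{
    static const std::map<std::string, DistributedDDLOutputMode> mapping =
    {
        {"none", DistributedDDLOutputMode::NONE},
        {"throw", DistributedDDLOutputMode::THROW},  // default value in Settings.h above
        {"null_status_on_timeout", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT},
        {"never_throw", DistributedDDLOutputMode::NEVER_THROW},
    };

    auto it = mapping.find(value);
    if (it == mapping.end())
        throw std::invalid_argument("Unknown distributed_ddl_output_mode: " + value);
    return it->second;
}
```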
diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 2e1cf025256..64ba51d1c68 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -102,4 +102,10 @@ IMPLEMENT_SETTING_ENUM(UnionMode, ErrorCodes::UNKNOWN_UNION, {"ALL", UnionMode::ALL}, {"DISTINCT", UnionMode::DISTINCT}}) +IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS, + {{"none", DistributedDDLOutputMode::NONE}, + {"throw", DistributedDDLOutputMode::THROW}, + {"null_status_on_timeout", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT}, + {"never_throw", DistributedDDLOutputMode::NEVER_THROW}}) + } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index c2ef08135eb..7615b185a61 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -138,4 +138,15 @@ enum class UnionMode DECLARE_SETTING_ENUM(UnionMode) + +enum class DistributedDDLOutputMode +{ + NONE, + THROW, + NULL_STATUS_ON_TIMEOUT, + NEVER_THROW, +}; + +DECLARE_SETTING_ENUM(DistributedDDLOutputMode) + } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index e5d2b23ace0..79373128713 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -231,8 +231,8 @@ void DatabaseOnDisk::createTable( if (create.attach_short_syntax) { /// Metadata already exists, table was detached + removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, true); attachTable(table_name, table, getTableDataPath(create)); - removeDetachedPermanentlyFlag(table_name, table_metadata_path); return; } @@ -270,12 +270,12 @@ void DatabaseOnDisk::createTable( commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context); - removeDetachedPermanentlyFlag(table_name, table_metadata_path); + removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, false); } /// If the table was detached permanently we will have a flag file with /// .sql.detached extension, is not needed anymore since we attached the table back -void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const +void DatabaseOnDisk::removeDetachedPermanentlyFlag(const Context &, const String & table_name, const String & table_metadata_path, bool) const { try { diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index fefe6e91606..b96a24f3345 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -94,11 +94,10 @@ protected: virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context); + virtual void removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const; + const String metadata_path; const String data_path; - -private: - void removeDetachedPermanentlyFlag(const String & table_name, const String & table_metadata_path) const; }; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 12cff3407d3..83609606bb8 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -105,7 +106,27 @@ std::pair DatabaseReplicated::parseFullReplicaName(const String ClusterPtr DatabaseReplicated::getCluster() const { - /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables 
+ { + std::lock_guard lock{mutex}; + if (cluster) + return cluster; + } + + ClusterPtr new_cluster = getClusterImpl(); + std::lock_guard lock{mutex}; + if (!cluster) + cluster = std::move(new_cluster); + return cluster; +} + +void DatabaseReplicated::setCluster(ClusterPtr && new_cluster) +{ + std::lock_guard lock{mutex}; + cluster = std::move(new_cluster); +} + +ClusterPtr DatabaseReplicated::getClusterImpl() const +{ Strings hosts; Strings host_ids; @@ -120,7 +141,7 @@ ClusterPtr DatabaseReplicated::getCluster() const hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat); if (hosts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); - Int32 cver = stat.cversion; + Int32 cversion = stat.cversion; std::sort(hosts.begin(), hosts.end()); std::vector futures; @@ -139,7 +160,9 @@ ClusterPtr DatabaseReplicated::getCluster() const } zookeeper->get(zookeeper_path + "/replicas", &stat); - if (success && cver == stat.version) + if (cversion != stat.cversion) + success = false; + if (success) break; } if (!success) @@ -157,22 +180,23 @@ ClusterPtr DatabaseReplicated::getCluster() const if (id == DROPPED_MARK) continue; auto [shard, replica] = parseFullReplicaName(hosts[i]); - auto pos = id.find(':'); - String host = id.substr(0, pos); + auto pos = id.rfind(':'); + String host_port = id.substr(0, pos); if (shard != current_shard) { current_shard = shard; if (!shards.back().empty()) shards.emplace_back(); } - shards.back().emplace_back(unescapeForFileName(host)); + shards.back().emplace_back(unescapeForFileName(host_port)); } - /// TODO make it configurable - String username = "default"; - String password; + String username = db_settings.cluster_username; + String password = db_settings.cluster_password; + UInt16 default_port = global_context.getTCPPort(); + bool secure = db_settings.cluster_secure_connection; - return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); + return std::make_shared(global_context.getSettingsRef(), shards, username, password, default_port, false, secure); } void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) @@ -253,11 +277,8 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP __builtin_unreachable(); } -void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) +void DatabaseReplicated::createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper) { - /// Write host name to replica_path, it will protect from multiple replicas with the same name - auto host_id = getHostID(global_context, db_uuid); - /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
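The `getCluster()` added above now caches the `ClusterPtr`: it checks the cache under the mutex, builds a fresh cluster via `getClusterImpl()` with the mutex released (the build does ZooKeeper round trips), then stores the result only if no other thread has filled the cache in the meantime, while `setCluster()` overwrites the cache when the replica set changes. A generic sketch of that pattern, with illustrative names and not tied to ClickHouse types:

```cpp
#include <memory>
#include <mutex>

// Check the cache under the lock, build the expensive object outside the
// lock, then store it only if the cache is still empty.
template <typename T, typename Builder>
class CachedValue
{
public:
    explicit CachedValue(Builder builder_) : builder(std::move(builder_)) {}

    std::shared_ptr<T> get() const
    {
        {
            std::lock_guard lock{mutex};
            if (cached)
                return cached;
        }

        std::shared_ptr<T> fresh = builder();  // potentially slow, runs unlocked

        std::lock_guard lock{mutex};
        if (!cached)
            cached = std::move(fresh);
        return cached;
    }

    void set(std::shared_ptr<T> value)  // analogue of setCluster(): refresh the cache
    {
        std::lock_guard lock{mutex};
        cached = std::move(value);
    }

private:
    Builder builder;
    mutable std::mutex mutex;
    mutable std::shared_ptr<T> cached;
};
```

Building outside the lock keeps the slow ZooKeeper calls from blocking every other reader, and re-checking before the store keeps the cache consistent when two threads race to fill it.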
DDLLogEntry entry{}; @@ -266,11 +287,20 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); String query_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(query_path + "/committed", getFullReplicaName(), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); +} + +void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) +{ + /// Write host name to replica_path, it will protect from multiple replicas with the same name + auto host_id = getHostID(global_context, db_uuid); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + createEmptyLogEntry(ops, current_zookeeper); current_zookeeper->multi(ops); } @@ -294,7 +324,11 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const /// Replicas will set correct name of current database in query context (database name can be different on replicas) if (auto * ddl_query = query->as()) + { + if (ddl_query->database != getDatabaseName()) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); ddl_query->database.clear(); + } if (const auto * query_alter = query->as()) { @@ -305,23 +339,26 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const } } + if (auto * query_drop = query->as()) + { + if (query_drop->kind == ASTDropQuery::Kind::Detach && query_context.getSettingsRef().database_replicated_always_detach_permanently) + query_drop->permanently = true; + if (query_drop->kind == ASTDropQuery::Kind::Detach && !query_drop->permanently) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " + "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA or set " + "database_replicated_always_detach_permanently to 1"); + } + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); - /// TODO maybe write current settings to log entry? 
DDLLogEntry entry; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); + entry.setSettingsIfRequired(query_context); String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context); - BlockIO io; - if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) - return io; - Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); - auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); - if (query_context.getSettingsRef().database_replicated_ddl_output) - io.in = std::move(stream); - return io; + return getDistributedDDLStatus(node_path, entry, query_context, hosts_to_wait); } static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context) @@ -557,12 +594,14 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth); auto & create = ast->as(); - if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || ! create.database.empty()) + if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || !create.database.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query); + bool is_materialized_view_with_inner_table = create.is_materialized_view && create.to_table_id.empty(); + create.database = getDatabaseName(); create.table = unescapeForFileName(node_name); - create.attach = false; + create.attach = is_materialized_view_with_inner_table; return ast; } @@ -570,8 +609,13 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->set(replica_path, DROPPED_MARK); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeSetRequest(replica_path, DROPPED_MARK, -1)); + createEmptyLogEntry(ops, current_zookeeper); + current_zookeeper->multi(ops); + DatabaseAtomic::drop(context_); + current_zookeeper->tryRemoveRecursive(replica_path); /// TODO it may leave garbage in ZooKeeper if the last node lost connection here if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK) @@ -598,7 +642,7 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { auto txn = context.getZooKeeperMetadataTransaction(); - assert(!ddl_worker->isCurrentlyActive() || txn); + assert(!ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id.")); if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); @@ -702,12 +746,28 @@ void DatabaseReplicated::detachTablePermanently(const Context & context, const S assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->isInitialQuery()) { + /// We have to remove metadata from zookeeper, because we do not distinguish permanently detached tables + /// from attached tables when recovering replica. 
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::detachTablePermanently(context, table_name); } +void DatabaseReplicated::removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const +{ + auto txn = context.getZooKeeperMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->isInitialQuery() && attach) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String statement = readMetadataFile(table_name); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::removeDetachedPermanentlyFlag(context, table_name, table_metadata_path, attach); +} + + String DatabaseReplicated::readMetadataFile(const String & table_name) const { String statement; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fde53cf2c29..8f2ccd27627 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -45,6 +45,7 @@ public: const ASTPtr & query) override; void removeDictionary(const Context & context, const String & dictionary_name) override; void detachTablePermanently(const Context & context, const String & table_name) override; + void removeDetachedPermanentlyFlag(const Context & context, const String & table_name, const String & table_metadata_path, bool attach) const override; /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. @@ -76,6 +77,11 @@ private: ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); String readMetadataFile(const String & table_name) const; + ClusterPtr getClusterImpl() const; + void setCluster(ClusterPtr && new_cluster); + + void createEmptyLogEntry(Coordination::Requests & ops, const ZooKeeperPtr & current_zookeeper); + String zookeeper_path; String shard_name; String replica_name; @@ -86,6 +92,8 @@ private: std::atomic_bool is_readonly = true; std::unique_ptr ddl_worker; + + mutable ClusterPtr cluster; }; } diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h index 11d5b3820e4..43003af1120 100644 --- a/src/Databases/DatabaseReplicatedSettings.h +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -11,6 +11,9 @@ class ASTStorage; M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ + M(String, cluster_username, "default", "Username to use when connecting to hosts of cluster", 0) \ + M(String, cluster_password, "", "Password to use when connecting to hosts of cluster", 0) \ + M(Bool, cluster_secure_connection, false, "Enable TLS when connecting to hosts of cluster", 0) \ DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index ee5a3b5eed0..b69e76697b0 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp 
+++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -237,6 +237,8 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->entry.query.empty()) { + /// Some replica is added or removed, let's update cached cluster + database->setCluster(database->getClusterImpl()); out_reason = fmt::format("Entry {} is a dummy task", entry_name); return {}; } diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 25e29d7e0e8..1e8e9ee58dc 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -241,6 +241,8 @@ const DictionaryAttribute & DictionaryStructure::getAttribute(const std::string for (const auto & key_attribute : *key) if (key_attribute.name == attribute_name) return key_attribute; + + throw Exception{"No such attribute '" + attribute_name + "' in keys", ErrorCodes::BAD_ARGUMENTS}; } size_t attribute_index = it->second; @@ -354,6 +356,7 @@ std::vector DictionaryStructure::getAttributes( config.keys(config_prefix, config_elems); auto has_hierarchy = false; + std::unordered_set attribute_names; std::vector res_attributes; const FormatSettings format_settings; @@ -376,6 +379,15 @@ std::vector DictionaryStructure::getAttributes( if ((range_min && name == range_min->name) || (range_max && name == range_max->name)) continue; + auto insert_result = attribute_names.insert(name); + bool inserted = insert_result.second; + + if (!inserted) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Dictionary attributes names must be unique. Attribute name ({}) is not unique", + name); + const auto type_string = config.getString(prefix + "type"); const auto initial_type = DataTypeFactory::instance().get(type_string); auto type = initial_type; diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index 4bafeeecf7e..0c9ca4ce714 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -221,7 +221,9 @@ BlockInputStreamPtr ExecutablePoolDictionarySource::getStreamForBlock(const Bloc }, configuration.max_command_execution_time * 10000); if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Could not get process from pool, max command execution timeout exceeded"); + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded ({}) seconds", + configuration.max_command_execution_time); size_t rows_to_read = block.rows(); auto read_stream = context.getInputFormat(configuration.format, process->out, sample_block, rows_to_read); @@ -298,7 +300,7 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) size_t max_command_execution_time = config.getUInt64(configuration_config_prefix + ".max_command_execution_time", 10); size_t max_execution_time_seconds = static_cast(context.getSettings().max_execution_time.totalSeconds()); - if (max_command_execution_time > max_execution_time_seconds) + if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; ExecutablePoolDictionarySource::Configuration configuration diff --git a/src/Dictionaries/LibraryDictionarySource.cpp b/src/Dictionaries/LibraryDictionarySource.cpp index 6d763444b54..0632dd3e30f 100644 --- a/src/Dictionaries/LibraryDictionarySource.cpp +++ b/src/Dictionaries/LibraryDictionarySource.cpp @@ -72,7 +72,7 @@ namespace } - Block 
dataToBlock(const Block & sample_block, const void * data) + Block dataToBlock(const Block & sample_block, const ClickHouseLibrary::RawClickHouseLibraryTable data) { if (!data) throw Exception("LibraryDictionarySource: No data returned", ErrorCodes::EXTERNAL_LIBRARY_ERROR); @@ -84,9 +84,7 @@ namespace + (columns_received->error_string ? columns_received->error_string : ""), ErrorCodes::EXTERNAL_LIBRARY_ERROR); - MutableColumns columns(sample_block.columns()); - for (const auto i : ext::range(0, columns.size())) - columns[i] = sample_block.getByPosition(i).column->cloneEmpty(); + MutableColumns columns = sample_block.cloneEmptyColumns(); for (size_t col_n = 0; col_n < columns_received->size; ++col_n) { @@ -151,8 +149,8 @@ LibraryDictionarySource::LibraryDictionarySource( #endif ); settings = std::make_shared(getLibSettings(config, config_prefix + lib_config_settings)); - if (auto lib_new = library->tryGetstrings), decltype(&ClickHouseLibrary::log))>( - "ClickHouseDictionary_v3_libNew")) + + if (auto lib_new = library->tryGet(ClickHouseLibrary::LIBRARY_CREATE_NEW_FUNC_NAME)) lib_data = lib_new(&settings->strings, ClickHouseLibrary::log); } @@ -166,17 +164,15 @@ LibraryDictionarySource::LibraryDictionarySource(const LibraryDictionarySource & , description{other.description} , settings{other.settings} { - if (auto lib_clone = library->tryGet("ClickHouseDictionary_v3_libClone")) + if (auto lib_clone = library->tryGet(ClickHouseLibrary::LIBRARY_CLONE_FUNC_NAME)) lib_data = lib_clone(other.lib_data); - else if ( - auto lib_new = library->tryGetstrings), decltype(&ClickHouseLibrary::log))>( - "ClickHouseDictionary_v3_libNew")) + else if (auto lib_new = library->tryGet(ClickHouseLibrary::LIBRARY_CREATE_NEW_FUNC_NAME)) lib_data = lib_new(&settings->strings, ClickHouseLibrary::log); } LibraryDictionarySource::~LibraryDictionarySource() { - if (auto lib_delete = library->tryGet("ClickHouseDictionary_v3_libDelete")) + if (auto lib_delete = library->tryGet(ClickHouseLibrary::LIBRARY_DELETE_FUNC_NAME)) lib_delete(lib_data); } @@ -193,15 +189,17 @@ BlockInputStreamPtr LibraryDictionarySource::loadAll() columns.data[i] = a.name.c_str(); ++i; } - void * data_ptr = nullptr; - /// Get function pointer before dataNew call because library->get may throw. - auto func_load_all - = library->getstrings), decltype(&columns))>("ClickHouseDictionary_v3_loadAll"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_all(data_ptr, &settings->strings, &columns); + auto load_all_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_ALL_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + ClickHouseLibrary::RawClickHouseLibraryTable data = load_all_func(data_ptr, &settings->strings, &columns); auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); + return std::make_shared(block); } @@ -219,16 +217,17 @@ BlockInputStreamPtr LibraryDictionarySource::loadIds(const std::vector & columns_pass.data[i] = a.name.c_str(); ++i; } - void * data_ptr = nullptr; - /// Get function pointer before dataNew call because library->get may throw. 
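The LibraryDictionarySource changes in this hunk replace per-call-site `decltype(...)` spellings with named function-pointer aliases resolved through `SharedLibrary::get`, which wraps `dlsym` as shown earlier in this patch. Below is a self-contained sketch of that idea against raw `dlsym`; the wrapper, the alias and the symbol name are made up for illustration — the real ones are the `Library*Func` aliases and `ClickHouseDictionary_v3_*` constants introduced further down.

```cpp
#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Resolve a symbol and hand it back already cast to a named function-pointer
// alias, so call sites stay readable and the signature lives in one place.
template <typename Func>
Func getSymbol(void * handle, const std::string & name)
{
    dlerror();  // clear any stale error state before dlsym
    void * sym = dlsym(handle, name.c_str());
    if (const char * error = dlerror())
        throw std::runtime_error("Cannot dlsym: " + std::string(error));
    return reinterpret_cast<Func>(sym);
}

// Hypothetical alias and symbol name standing in for the patch's constants.
using ExampleLoadAllFunc = void * (*)(void * lib_data);

void * loadAll(void * library_handle, void * lib_data)
{
    auto load_all = getSymbol<ExampleLoadAllFunc>(library_handle, "Example_loadAll");
    return load_all(lib_data);
}
```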
- auto func_load_ids - = library->getstrings), decltype(&columns_pass), decltype(&ids_data))>( - "ClickHouseDictionary_v3_loadIds"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_ids(data_ptr, &settings->strings, &columns_pass, &ids_data); + auto load_ids_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_IDS_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + ClickHouseLibrary::RawClickHouseLibraryTable data = load_ids_func(data_ptr, &settings->strings, &columns_pass, &ids_data); auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); + return std::make_shared(block); } @@ -254,30 +253,34 @@ BlockInputStreamPtr LibraryDictionarySource::loadKeys(const Columns & key_column ClickHouseLibrary::Table request_cols{.data = static_cast(holder.get()), .size = key_columns.size()}; - void * data_ptr = nullptr; - /// Get function pointer before dataNew call because library->get may throw. - auto func_load_keys = library->getstrings), decltype(&request_cols))>( - "ClickHouseDictionary_v3_loadKeys"); - data_ptr = library->get("ClickHouseDictionary_v3_dataNew")(lib_data); - auto * data = func_load_keys(data_ptr, &settings->strings, &request_cols); + auto load_keys_func = library->get(ClickHouseLibrary::LIBRARY_LOAD_KEYS_FUNC_NAME); + auto data_new_func = library->get(ClickHouseLibrary::LIBRARY_DATA_NEW_FUNC_NAME); + auto data_delete_func = library->get(ClickHouseLibrary::LIBRARY_DATA_DELETE_FUNC_NAME); + + ClickHouseLibrary::LibraryData data_ptr = data_new_func(lib_data); + SCOPE_EXIT(data_delete_func(lib_data, data_ptr)); + + ClickHouseLibrary::RawClickHouseLibraryTable data = load_keys_func(data_ptr, &settings->strings, &request_cols); auto block = dataToBlock(description.sample_block, data); - SCOPE_EXIT(library->get("ClickHouseDictionary_v3_dataDelete")(lib_data, data_ptr)); + return std::make_shared(block); } bool LibraryDictionarySource::isModified() const { - if (auto func_is_modified - = library->tryGetstrings))>("ClickHouseDictionary_v3_isModified")) + if (auto func_is_modified = library->tryGet( + ClickHouseLibrary::LIBRARY_IS_MODIFIED_FUNC_NAME)) return func_is_modified(lib_data, &settings->strings); + return true; } bool LibraryDictionarySource::supportsSelectiveLoad() const { - if (auto func_supports_selective_load - = library->tryGetstrings))>("ClickHouseDictionary_v3_supportsSelectiveLoad")) + if (auto func_supports_selective_load = library->tryGet( + ClickHouseLibrary::LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME)) return func_supports_selective_load(lib_data, &settings->strings); + return true; } diff --git a/src/Dictionaries/LibraryDictionarySourceExternal.cpp b/src/Dictionaries/LibraryDictionarySourceExternal.cpp index 2e944056283..259d0a2846a 100644 --- a/src/Dictionaries/LibraryDictionarySourceExternal.cpp +++ b/src/Dictionaries/LibraryDictionarySourceExternal.cpp @@ -6,10 +6,25 @@ namespace const char DICT_LOGGER_NAME[] = "LibraryDictionarySourceExternal"; } -void ClickHouseLibrary::log(ClickHouseLibrary::LogLevel level, ClickHouseLibrary::CString msg) +namespace ClickHouseLibrary { - using ClickHouseLibrary::LogLevel; +std::string_view LIBRARY_CREATE_NEW_FUNC_NAME = 
"ClickHouseDictionary_v3_libNew"; +std::string_view LIBRARY_CLONE_FUNC_NAME = "ClickHouseDictionary_v3_libClone"; +std::string_view LIBRARY_DELETE_FUNC_NAME = "ClickHouseDictionary_v3_libDelete"; + +std::string_view LIBRARY_DATA_NEW_FUNC_NAME = "ClickHouseDictionary_v3_dataNew"; +std::string_view LIBRARY_DATA_DELETE_FUNC_NAME = "ClickHouseDictionary_v3_dataDelete"; + +std::string_view LIBRARY_LOAD_ALL_FUNC_NAME = "ClickHouseDictionary_v3_loadAll"; +std::string_view LIBRARY_LOAD_IDS_FUNC_NAME = "ClickHouseDictionary_v3_loadIds"; +std::string_view LIBRARY_LOAD_KEYS_FUNC_NAME = "ClickHouseDictionary_v3_loadKeys"; + +std::string_view LIBRARY_IS_MODIFIED_FUNC_NAME = "ClickHouseDictionary_v3_isModified"; +std::string_view LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME = "ClickHouseDictionary_v3_supportsSelectiveLoad"; + +void log(LogLevel level, CString msg) +{ auto & logger = Poco::Logger::get(DICT_LOGGER_NAME); switch (level) { @@ -47,3 +62,5 @@ void ClickHouseLibrary::log(ClickHouseLibrary::LogLevel level, ClickHouseLibrary break; } } + +} diff --git a/src/Dictionaries/LibraryDictionarySourceExternal.h b/src/Dictionaries/LibraryDictionarySourceExternal.h index 7a031cdb315..3b92707d091 100644 --- a/src/Dictionaries/LibraryDictionarySourceExternal.h +++ b/src/Dictionaries/LibraryDictionarySourceExternal.h @@ -1,6 +1,7 @@ #pragma once #include +#include #define CLICKHOUSE_DICTIONARY_LIBRARY_API 1 @@ -61,4 +62,49 @@ enum LogLevel }; void log(LogLevel level, CString msg); + +extern std::string_view LIBRARY_CREATE_NEW_FUNC_NAME; +extern std::string_view LIBRARY_CLONE_FUNC_NAME; +extern std::string_view LIBRARY_DELETE_FUNC_NAME; + +extern std::string_view LIBRARY_DATA_NEW_FUNC_NAME; +extern std::string_view LIBRARY_DATA_DELETE_FUNC_NAME; + +extern std::string_view LIBRARY_LOAD_ALL_FUNC_NAME; +extern std::string_view LIBRARY_LOAD_IDS_FUNC_NAME; +extern std::string_view LIBRARY_LOAD_KEYS_FUNC_NAME; + +extern std::string_view LIBRARY_IS_MODIFIED_FUNC_NAME; +extern std::string_view LIBRARY_SUPPORTS_SELECTIVE_LOAD_FUNC_NAME; + +using LibraryContext = void *; + +using LibraryLoggerFunc = void (*)(LogLevel, CString /* message */); + +using LibrarySettings = CStrings *; + +using LibraryNewFunc = LibraryContext (*)(LibrarySettings, LibraryLoggerFunc); +using LibraryCloneFunc = LibraryContext (*)(LibraryContext); +using LibraryDeleteFunc = void (*)(LibraryContext); + +using LibraryData = void *; +using LibraryDataNewFunc = LibraryData (*)(LibraryContext); +using LibraryDataDeleteFunc = void (*)(LibraryContext, LibraryData); + +/// Can be safely casted into const Table * with static_cast +using RawClickHouseLibraryTable = void *; +using RequestedColumnsNames = CStrings *; + +using LibraryLoadAllFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedColumnsNames); + +using RequestedIds = const VectorUInt64 *; +using LibraryLoadIdsFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedColumnsNames, RequestedIds); + +using RequestedKeys = Table *; +/// There is no requested columns names for load keys func +using LibraryLoadKeysFunc = RawClickHouseLibraryTable (*)(LibraryData, LibrarySettings, RequestedKeys); + +using LibraryIsModifiedFunc = bool (*)(LibraryContext, LibrarySettings); +using LibrarySupportsSelectiveLoadFunc = bool (*)(LibraryContext, LibrarySettings); + } diff --git a/src/Dictionaries/RangeDictionaryBlockInputStream.h b/src/Dictionaries/RangeDictionaryBlockInputStream.h index ccd77d49e0f..6531f5cba9d 100644 --- 
a/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -17,14 +17,14 @@ namespace DB * BlockInputStream implementation for external dictionaries * read() returns single block consisting of the in-memory contents of the dictionaries */ -template +template class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: - using DictionaryPtr = std::shared_ptr; + using Key = UInt64; RangeDictionaryBlockInputStream( - DictionaryPtr dictionary, + std::shared_ptr dictionary, size_t max_block_size, const Names & column_names, PaddedPODArray && ids_to_fill, @@ -40,35 +40,26 @@ private: template ColumnPtr getColumnFromPODArray(const PaddedPODArray & array) const; - template - void addSpecialColumn( - const std::optional & attribute, - DataTypePtr type, - const std::string & default_name, - const std::unordered_set & column_names_set, - const PaddedPODArray & values, - ColumnsWithTypeAndName & columns, - bool force = false) const; - Block fillBlock( const PaddedPODArray & ids_to_fill, const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - PaddedPODArray - makeDateKey(const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; + PaddedPODArray makeDateKey( + const PaddedPODArray & block_start_dates, + const PaddedPODArray & block_end_dates) const; - DictionaryPtr dictionary; - Names column_names; + std::shared_ptr dictionary; + NameSet column_names; PaddedPODArray ids; PaddedPODArray start_dates; PaddedPODArray end_dates; }; -template -RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( - DictionaryPtr dictionary_, +template +RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( + std::shared_ptr dictionary_, size_t max_block_size_, const Names & column_names_, PaddedPODArray && ids_, @@ -76,15 +67,15 @@ RangeDictionaryBlockInputStream::RangeDictionary PaddedPODArray && block_end_dates) : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) , dictionary(dictionary_) - , column_names(column_names_) + , column_names(column_names_.begin(), column_names_.end()) , ids(std::move(ids_)) , start_dates(std::move(block_start_dates)) , end_dates(std::move(block_end_dates)) { } -template -Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const +template +Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const { PaddedPODArray block_ids; PaddedPODArray block_start_dates; @@ -103,38 +94,19 @@ Block RangeDictionaryBlockInputStream::getBlock( return fillBlock(block_ids, block_start_dates, block_end_dates); } -template +template template -ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const +ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const { auto column_vector = ColumnVector::create(); column_vector->getData().reserve(array.size()); - for (T value : array) - column_vector->insertValue(value); + column_vector->getData().insert(array.begin(), array.end()); + return column_vector; } -template -template -void RangeDictionaryBlockInputStream::addSpecialColumn( - const std::optional & attribute, - DataTypePtr type, - const std::string & default_name, - const std::unordered_set & column_names_set, - const PaddedPODArray & values, - ColumnsWithTypeAndName & columns, - bool force) const -{ - std::string name = default_name; - if (attribute) - name = attribute->name; - - if (force || 
column_names_set.find(name) != column_names_set.end()) - columns.emplace_back(getColumnFromPODArray(values), type, name); -} - -template -PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( +template +PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const { PaddedPODArray key(block_start_dates.size()); @@ -150,8 +122,8 @@ PaddedPODArray RangeDictionaryBlockInputStream -Block RangeDictionaryBlockInputStream::fillBlock( +template +Block RangeDictionaryBlockInputStream::fillBlock( const PaddedPODArray & ids_to_fill, const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const @@ -159,20 +131,32 @@ Block RangeDictionaryBlockInputStream::fillBlock ColumnsWithTypeAndName columns; const DictionaryStructure & structure = dictionary->getStructure(); - std::unordered_set names(column_names.begin(), column_names.end()); - - addSpecialColumn(structure.id, std::make_shared(), "ID", names, ids_to_fill, columns, true); - auto ids_column = columns.back().column; - addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start", names, block_start_dates, columns); - addSpecialColumn(structure.range_max, structure.range_max->type, "Range End", names, block_end_dates, columns); + auto ids_column = getColumnFromPODArray(ids_to_fill); + const std::string & id_column_name = structure.id->name; + if (column_names.find(id_column_name) != column_names.end()) + columns.emplace_back(ids_column, std::make_shared(), id_column_name); auto date_key = makeDateKey(block_start_dates, block_end_dates); auto date_column = getColumnFromPODArray(date_key); + const std::string & range_min_column_name = structure.range_min->name; + if (column_names.find(range_min_column_name) != column_names.end()) + { + auto range_min_column = getColumnFromPODArray(block_start_dates); + columns.emplace_back(range_min_column, structure.range_max->type, range_min_column_name); + } + + const std::string & range_max_column_name = structure.range_max->name; + if (column_names.find(range_max_column_name) != column_names.end()) + { + auto range_max_column = getColumnFromPODArray(block_end_dates); + columns.emplace_back(range_max_column, structure.range_max->type, range_max_column_name); + } + for (const auto idx : ext::range(0, structure.attributes.size())) { const DictionaryAttribute & attribute = structure.attributes[idx]; - if (names.find(attribute.name) != names.end()) + if (column_names.find(attribute.name) != column_names.end()) { ColumnPtr column = dictionary->getColumn( attribute.name, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index f5be04c120d..52b6f219bac 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -52,7 +52,6 @@ namespace ErrorCodes extern const int DICTIONARY_IS_EMPTY; extern const int TYPE_MISMATCH; extern const int UNSUPPORTED_METHOD; - extern const int NOT_IMPLEMENTED; } bool RangeHashedDictionary::Range::isCorrectDate(const RangeStorageType & date) @@ -178,10 +177,76 @@ ColumnPtr RangeHashedDictionary::getColumn( return result; } -ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns &, const DataTypes &) const +ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Has not supported", getDictionaryID().getNameForLogs()); + auto range_storage_column = 
key_columns[1]; + ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""}; + + auto range_column_storage_type = std::make_shared(); + auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type); + + PaddedPODArray key_backup_storage; + PaddedPODArray range_backup_storage; + + const PaddedPODArray & ids = getColumnVectorData(this, key_columns[0], key_backup_storage); + const PaddedPODArray & dates = getColumnVectorData(this, range_column_updated, range_backup_storage); + + const auto & attribute = attributes.front(); + + ColumnUInt8::Ptr result; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + result = hasKeysImpl(attribute, ids, dates); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + query_count.fetch_add(ids.size(), std::memory_order_relaxed); + + return result; +} + +template +ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl( + const Attribute & attribute, + const PaddedPODArray & ids, + const PaddedPODArray & dates) const +{ + auto result = ColumnUInt8::create(ids.size()); + auto& out = result->getData(); + + const auto & attr = *std::get>(attribute.maps); + + for (const auto row : ext::range(0, ids.size())) + { + const auto it = attr.find(ids[row]); + + if (it) + { + const auto date = dates[row]; + const auto & ranges_and_values = it->getMapped(); + const auto val_it = std::find_if( + std::begin(ranges_and_values), + std::end(ranges_and_values), + [date](const Value & v) + { + return v.range.contains(date); + }); + + if (val_it != std::end(ranges_and_values)) + out[row] = true; + else + out[row] = false; + } + else + out[row] = false; + } + + return result; } void RangeHashedDictionary::createAttributes() @@ -450,7 +515,9 @@ RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, template void RangeHashedDictionary::getIdsAndDates( - PaddedPODArray & ids, PaddedPODArray & start_dates, PaddedPODArray & end_dates) const + PaddedPODArray & ids, + PaddedPODArray & start_dates, + PaddedPODArray & end_dates) const { const auto & attribute = attributes.front(); @@ -458,11 +525,9 @@ void RangeHashedDictionary::getIdsAndDates( { using Type = std::decay_t; using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - if constexpr (std::is_same_v) - getIdsAndDates(attribute, ids, start_dates, end_dates); - else - getIdsAndDates(attribute, ids, start_dates, end_dates); + getIdsAndDates(attribute, ids, start_dates, end_dates); }; callOnDictionaryAttributeType(attribute.type, type_call); @@ -506,13 +571,20 @@ BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & PaddedPODArray end_dates; getIdsAndDates(ids, start_dates, end_dates); - using BlockInputStreamType = RangeDictionaryBlockInputStream; - auto dict_ptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared( - dict_ptr, max_block_size, column_names, std::move(ids), std::move(start_dates), std::move(end_dates)); + using BlockInputStreamType = RangeDictionaryBlockInputStream; + + auto stream = std::make_shared( + shared_from_this(), + max_block_size, + column_names, + std::move(ids), + std::move(start_dates), + std::move(end_dates)); + + return stream; } -struct RangeHashedDIctionaryCallGetBlockInputStreamImpl +struct RangeHashedDictionaryCallGetBlockInputStreamImpl { BlockInputStreamPtr 
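The new hasKeysImpl boils down to a per-row check: look up the ranges stored for the id and test whether any of them contains the requested date (the real code additionally dispatches on the attribute type via callOnDictionaryAttributeType). A standalone sketch of that check with a plain map in place of the dictionary storage:

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

/// Simplified model of the per-attribute storage: id -> list of ranges.
struct Range
{
    int64_t left;
    int64_t right;
    bool contains(int64_t point) const { return left <= point && point <= right; }
};

bool hasKey(const std::unordered_map<uint64_t, std::vector<Range>> & attr, uint64_t id, int64_t date)
{
    const auto it = attr.find(id);
    if (it == attr.end())
        return false;

    const auto & ranges = it->second;
    return std::any_of(ranges.begin(), ranges.end(), [date](const Range & r) { return r.contains(date); });
}

int main()
{
    std::unordered_map<uint64_t, std::vector<Range>> attr{{42, {{100, 200}}}};
    bool a = hasKey(attr, 42, 150);  /// true: id found and 150 lies in [100, 200]
    bool b = hasKey(attr, 42, 300);  /// false: id found, but no range contains 300
    bool c = hasKey(attr, 7, 150);   /// false: id not present at all
    (void)a; (void)b; (void)c;
}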
stream; const RangeHashedDictionary * dict; @@ -532,7 +604,7 @@ BlockInputStreamPtr RangeHashedDictionary::getBlockInputStream(const Names & col { using ListType = TypeList; - RangeHashedDIctionaryCallGetBlockInputStreamImpl callable; + RangeHashedDictionaryCallGetBlockInputStreamImpl callable; callable.dict = this; callable.column_names = &column_names; callable.max_block_size = max_block_size; diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 1f93fa75775..f2b24e52dfc 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -93,8 +93,6 @@ private: template using Ptr = std::unique_ptr>; - using NullableSet = HashSet>; - struct Attribute final { public: @@ -159,6 +157,12 @@ private: ValueSetter && set_value, DefaultValueExtractor & default_value_extractor) const; + template + ColumnUInt8::Ptr hasKeysImpl( + const Attribute & attribute, + const PaddedPODArray & ids, + const PaddedPODArray & dates) const; + template static void setAttributeValueImpl(Attribute & attribute, const Key id, const Range & range, const Field & value); @@ -181,7 +185,7 @@ private: template BlockInputStreamPtr getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const; - friend struct RangeHashedDIctionaryCallGetBlockInputStreamImpl; + friend struct RangeHashedDictionaryCallGetBlockInputStreamImpl; const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 6d991c17c67..47a0e0655d6 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -265,6 +265,20 @@ void DiskCacheWrapper::removeRecursive(const String & path) DiskDecorator::removeRecursive(path); } +void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeSharedFile(path, keep_s3); + DiskDecorator::removeSharedFile(path, keep_s3); +} + +void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeSharedRecursive(path, keep_s3); + DiskDecorator::removeSharedRecursive(path, keep_s3); +} + void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) { /// Don't create hardlinks for cache files to shadow directory as it just waste cache disk space. 
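DiskCacheWrapper simply drops the cached copy first and then forwards to the wrapped disk; the interesting part is the keep_s3 flag itself: by default removeSharedFile is just removeFile, and only disks whose data may be shared between replicas (DiskS3) skip deleting the remote objects when keep_s3 is true. A small illustrative sketch of that contract (ExampleDisk and ExampleSharedDisk are stand-ins, not the real IDisk hierarchy):

#include <iostream>
#include <string>

struct ExampleDisk
{
    virtual ~ExampleDisk() = default;
    virtual void removeFile(const std::string & path) { std::cout << "remove " << path << "\n"; }
    /// Default: no shared data, so the flag is ignored.
    virtual void removeSharedFile(const std::string & path, bool /*keep_s3*/) { removeFile(path); }
};

struct ExampleSharedDisk : ExampleDisk
{
    void removeSharedFile(const std::string & path, bool keep_s3) override
    {
        std::cout << "remove local metadata of " << path << "\n";
        if (!keep_s3)
            std::cout << "remove S3 objects of " << path << "\n";
    }
};

int main()
{
    ExampleSharedDisk disk;
    disk.removeSharedFile("part/checksums.txt", /*keep_s3=*/ true);   /// another replica still references the data
    disk.removeSharedFile("part/checksums.txt", /*keep_s3=*/ false);  /// last owner: drop the S3 objects too
}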
diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index bf1a5df693a..46aadb78007 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -40,6 +40,8 @@ public: void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 3ebd1b6cf3b..eeb72fe1246 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -145,6 +145,16 @@ void DiskDecorator::removeRecursive(const String & path) delegate->removeRecursive(path); } +void DiskDecorator::removeSharedFile(const String & path, bool keep_s3) +{ + delegate->removeSharedFile(path, keep_s3); +} + +void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3) +{ + delegate->removeSharedRecursive(path, keep_s3); +} + void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) { delegate->setLastModified(path, timestamp); diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index c204d10bed9..d5ac6f0fda0 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -42,11 +42,18 @@ public: void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; void createHardLink(const String & src_path, const String & dst_path) override; void truncateFile(const String & path, size_t size) override; + int open(const String & path, mode_t mode) const; + void close(int fd) const; + void sync(int fd) const; + String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } + bool checkUniqueId(const String & id) const override { return delegate->checkUniqueId(id); } DiskType::Type getType() const override { return delegate->getType(); } Executor & getExecutor() override; void onFreeze(const String & path) override; diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h new file mode 100644 index 00000000000..4e0ae226af4 --- /dev/null +++ b/src/Disks/DiskType.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +struct DiskType +{ + enum class Type + { + Local, + RAM, + S3 + }; + static String toString(Type disk_type) + { + switch (disk_type) + { + case Type::Local: + return "local"; + case Type::RAM: + return "memory"; + case Type::S3: + return "s3"; + } + __builtin_unreachable(); + } +}; + +} + diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 6f021346174..44c4fe73d37 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -57,29 +58,6 @@ public: using SpacePtr = std::shared_ptr; -struct DiskType -{ - enum class Type - { - Local, - RAM, - S3 - }; - static String toString(Type disk_type) - { - switch (disk_type) - { - case Type::Local: - 
return "local"; - case Type::RAM: - return "memory"; - case Type::S3: - return "s3"; - } - __builtin_unreachable(); - } -}; - /** * A guard, that should synchronize file's or directory's state * with storage device (e.g. fsync in POSIX) in its destructor. @@ -195,6 +173,21 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. virtual void removeRecursive(const String & path) = 0; + /// Remove file. Throws exception if file doesn't exists or if directory is not empty. + /// Differs from removeFile for S3 disks + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + virtual void removeSharedFile(const String & path, bool) { removeFile(path); } + + /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. + /// Differs from removeRecursive for S3 disks + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); } + + /// Remove file or directory if it exists. + /// Differs from removeFileIfExists for S3 disks + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + virtual void removeSharedFileIfExists(const String & path, bool) { removeFileIfExists(path); } + /// Set last modified time to file or directory at `path`. virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0; @@ -216,6 +209,15 @@ public: /// Invoked when Global Context is shutdown. virtual void shutdown() { } + /// Return some uniq string for file, overrode for S3 + /// Required for distinguish different copies of the same part on S3 + virtual String getUniqueId(const String & path) const { return path; } + + /// Check file exists and ClickHouse has an access to it + /// Overrode in DiskS3 + /// Required for S3 to ensure that replica has access to data wroten by other node + virtual bool checkUniqueId(const String & id) const { return exists(id); } + /// Returns executor to perform asynchronous operations. 
virtual Executor & getExecutor() { return *executor; } diff --git a/src/Disks/IStoragePolicy.h b/src/Disks/IStoragePolicy.h index a41ea87c328..59cff3c85d5 100644 --- a/src/Disks/IStoragePolicy.h +++ b/src/Disks/IStoragePolicy.h @@ -1,4 +1,7 @@ #pragma once + +#include + #include #include #include @@ -36,6 +39,7 @@ public: /// mutations files virtual DiskPtr getAnyDisk() const = 0; virtual DiskPtr getDiskByName(const String & disk_name) const = 0; + virtual Disks getDisksByType(DiskType::Type type) const = 0; /// Get free space from most free disk virtual UInt64 getMaxUnreservedFreeSpace() const = 0; /// Reserves space on any volume with index > min_volume_index or returns nullptr @@ -57,6 +61,7 @@ public: /// Check if we have any volume with stopped merges virtual bool hasAnyVolumeWithDisabledMerges() const = 0; virtual bool containsVolume(const String & volume_name) const = 0; + /// Returns disks by type ordered by volumes priority }; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 3d91d5fbb78..bb9966eb6ff 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -90,6 +90,16 @@ void throwIfError(Aws::Utils::Outcome & response) } } +template +void throwIfError(const Aws::Utils::Outcome & response) +{ + if (!response.IsSuccess()) + { + const auto & err = response.GetError(); + throw Exception(err.GetMessage(), static_cast(err.GetErrorType())); + } +} + /** * S3 metadata file layout: * Number of S3 objects, Total size of all S3 objects. @@ -609,6 +619,15 @@ void DiskS3::createDirectories(const String & path) Poco::File(metadata_path + path).createDirectories(); } +String DiskS3::getUniqueId(const String & path) const +{ + Metadata metadata(s3_root_path, metadata_path, path); + String id; + if (!metadata.s3_objects.empty()) + id = metadata.s3_root_path + metadata.s3_objects[0].first; + return id; +} + DiskDirectoryIteratorPtr DiskS3::iterateDirectory(const String & path) { return std::make_unique(metadata_path + path, path); @@ -791,13 +810,6 @@ void DiskS3::removeAws(const AwsS3KeyKeeper & keys) } } -void DiskS3::removeFile(const String & path) -{ - AwsS3KeyKeeper keys; - removeMeta(path, keys); - removeAws(keys); -} - void DiskS3::removeFileIfExists(const String & path) { AwsS3KeyKeeper keys; @@ -813,11 +825,20 @@ void DiskS3::removeDirectory(const String & path) Poco::File(metadata_path + path).remove(); } -void DiskS3::removeRecursive(const String & path) +void DiskS3::removeSharedFile(const String & path, bool keep_s3) +{ + AwsS3KeyKeeper keys; + removeMeta(path, keys); + if (!keep_s3) + removeAws(keys); +} + +void DiskS3::removeSharedRecursive(const String & path, bool keep_s3) { AwsS3KeyKeeper keys; removeMetaRecursive(path, keys); - removeAws(keys); + if (!keep_s3) + removeAws(keys); } bool DiskS3::tryReserve(UInt64 bytes) @@ -956,6 +977,23 @@ bool DiskS3::checkObjectExists(const String & prefix) return !outcome.GetResult().GetContents().empty(); } +bool DiskS3::checkUniqueId(const String & id) const +{ + /// Check that we have right s3 and have access rights + /// Actually interprets id as s3 object name and checks if it exists + Aws::S3::Model::ListObjectsV2Request request; + request.SetBucket(bucket); + request.SetPrefix(id); + auto resp = client->ListObjectsV2(request); + throwIfError(resp); + Aws::Vector object_list = resp.GetResult().GetContents(); + + for (const auto & object : object_list) + if (object.GetKey() == id) + return true; + return false; +} + Aws::S3::Model::HeadObjectResult DiskS3::headObject(const String & source_bucket, const 
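DiskS3::getUniqueId returns the S3 key of the first object referenced by the part's metadata, and checkUniqueId verifies that this replica can actually see that object: it lists the bucket with the id as prefix and then requires an exact key match, since a prefix hit alone does not prove the object exists. A minimal sketch of that check with the ListObjectsV2 call replaced by a plain in-memory listing:

#include <algorithm>
#include <string>
#include <vector>

/// Stand-in for the ListObjectsV2 call: the id is used as the listing prefix,
/// and the returned keys are compared for exact equality.
bool exampleCheckUniqueId(const std::vector<std::string> & bucket_keys, const std::string & id)
{
    std::vector<std::string> listed;
    for (const auto & key : bucket_keys)
        if (key.rfind(id, 0) == 0)   /// "starts with id", like SetPrefix(id)
            listed.push_back(key);

    return std::any_of(listed.begin(), listed.end(), [&](const std::string & key) { return key == id; });
}

int main()
{
    std::vector<std::string> keys{"data/abc", "data/abcdef"};
    bool exact = exampleCheckUniqueId(keys, "data/abc");      /// true: the exact object exists
    bool prefix_only = exampleCheckUniqueId(keys, "data/ab"); /// false: only longer keys share the prefix
    (void)exact; (void)prefix_only;
}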
String & key) { Aws::S3::Model::HeadObjectRequest request; diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 5182ae4801b..5d9effa16fa 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -96,10 +96,13 @@ public: size_t buf_size, WriteMode mode) override; - void removeFile(const String & path) override; + void removeFile(const String & path) override { removeSharedFile(path, false); } void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; - void removeRecursive(const String & path) override; + void removeRecursive(const String & path) override { removeSharedRecursive(path, false); } + + void removeSharedFile(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; @@ -115,6 +118,14 @@ public: void shutdown() override; + /// Return some uniq string for file + /// Required for distinguish different copies of the same part on S3 + String getUniqueId(const String & path) const override; + + /// Check file exists and ClickHouse has an access to it + /// Required for S3 to ensure that replica has access to data wroten by other node + bool checkUniqueId(const String & id) const override; + /// Actions performed after disk creation. void startup(); diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index a1345879c83..cff2685ca24 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -159,6 +159,17 @@ Disks StoragePolicy::getDisks() const } +Disks StoragePolicy::getDisksByType(DiskType::Type type) const +{ + Disks res; + for (const auto & volume : volumes) + for (const auto & disk : volume->getDisks()) + if (disk->getType() == type) + res.push_back(disk); + return res; +} + + DiskPtr StoragePolicy::getAnyDisk() const { /// StoragePolicy must contain at least one Volume diff --git a/src/Disks/StoragePolicy.h b/src/Disks/StoragePolicy.h index 6676ab19043..71773e91f70 100644 --- a/src/Disks/StoragePolicy.h +++ b/src/Disks/StoragePolicy.h @@ -47,6 +47,9 @@ public: /// Returns disks ordered by volumes priority Disks getDisks() const override; + /// Returns disks by type ordered by volumes priority + Disks getDisksByType(DiskType::Type type) const override; + /// Returns any disk /// Used when it's not important, for example for /// mutations files diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index acf158d42ef..4177d686f57 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -54,7 +54,6 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; extern const int TYPE_MISMATCH; - extern const int NOT_IMPLEMENTED; } @@ -154,13 +153,20 @@ public: String getName() const override { return name; } private: - size_t getNumberOfArguments() const override { return 2; } + size_t getNumberOfArguments() const override { return 0; } + bool isVariadic() const override { return true; } + + bool isDeterministic() const override { return false; } bool useDefaultImplementationForConstants() const final { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return {0}; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { + if (arguments.size() < 2) + throw Exception{"Wrong argument count for function " + getName(), ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; + 
if (!isString(arguments[0])) throw Exception{"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ", expected a string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; @@ -173,8 +179,6 @@ private: return std::make_shared(); } - bool isDeterministic() const override { return false; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { /** Do not require existence of the dictionary if the function is called for empty columns. @@ -194,6 +198,24 @@ private: const auto key_column = key_column_with_type.column; const auto key_column_type = WhichDataType(key_column_with_type.type); + ColumnPtr range_col = nullptr; + DataTypePtr range_col_type = nullptr; + + if (dictionary_key_type == DictionaryKeyType::range) + { + if (arguments.size() != 3) + throw Exception{"Wrong argument count for function " + getName() + + " when dictionary has key type range", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; + + range_col = arguments[2].column; + range_col_type = arguments[2].type; + + if (!(range_col_type->isValueRepresentedByInteger() && range_col_type->getSizeOfValueInMemory() <= sizeof(Int64))) + throw Exception{"Illegal type " + range_col_type->getName() + " of fourth argument of function " + + getName() + " must be convertible to Int64.", + ErrorCodes::ILLEGAL_COLUMN}; + } + if (dictionary_key_type == DictionaryKeyType::simple) { if (!key_column_type.isUInt64()) @@ -217,7 +239,7 @@ private: return dictionary->hasKeys(key_columns, key_types); } else - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Has not supported for range dictionary", dictionary->getDictionaryID().getNameForLogs()); + return dictionary->hasKeys({key_column, range_col}, {std::make_shared(), range_col_type}); } mutable FunctionDictHelper helper; diff --git a/src/Functions/modulo.cpp b/src/Functions/modulo.cpp index d9bf74ccaf5..fe215851bb6 100644 --- a/src/Functions/modulo.cpp +++ b/src/Functions/modulo.cpp @@ -70,6 +70,14 @@ struct ModuloByConstantImpl if (unlikely(static_cast(b) == 0)) throw Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION); + /// Division by min negative value. + if (std::is_signed_v && b == std::numeric_limits::lowest()) + throw Exception("Division by the most negative number", ErrorCodes::ILLEGAL_DIVISION); + + /// Modulo of division by negative number is the same as the positive number. + if (b < 0) + b = -b; + libdivide::divider divider(b); /// Here we failed to make the SSE variant from libdivide give an advantage. diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index fb9788e84c4..0dbda7d2c75 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -115,7 +115,9 @@ Cluster::Address::Address( const String & password_, UInt16 clickhouse_port, bool secure_, - Int64 priority_) + Int64 priority_, + UInt32 shard_index_, + UInt32 replica_index_) : user(user_) , password(password_) { @@ -125,6 +127,8 @@ Cluster::Address::Address( secure = secure_ ? 
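The ModuloByConstantImpl change above relies on two facts about C++'s truncated division: the remainder depends only on the divisor's magnitude (so a negative divisor can be flipped to positive before building the libdivide divider), and the one divisor that cannot be flipped is the most negative value, whose negation overflows. A tiny standalone check of both:

#include <cassert>
#include <cstdint>

int main()
{
    /// With truncated division the remainder keeps the sign of the dividend,
    /// so negating the divisor never changes the result: a % b == a % -b.
    for (int64_t a : {7, -7, 10, -10})
    {
        assert(a % 3 == a % -3);
        assert(a % 5 == a % -5);
    }

    /// The one divisor that cannot be negated is the most negative value:
    /// int64_t bad = -std::numeric_limits<int64_t>::lowest();  // signed overflow, undefined behaviour
    /// which is why that case is rejected explicitly before the sign flip.
}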
Protocol::Secure::Enable : Protocol::Secure::Disable; priority = priority_; is_local = isLocal(clickhouse_port); + shard_index = shard_index_; + replica_index = replica_index_; } @@ -465,7 +469,7 @@ Cluster::Cluster(const Settings & settings, const std::vector #include #include +#include #include #include #include @@ -1774,11 +1775,14 @@ std::optional Context::getTCPPortSecure() const std::shared_ptr Context::getCluster(const std::string & cluster_name) const { auto res = getClusters().getCluster(cluster_name); + if (res) + return res; - if (!res) - throw Exception("Requested cluster '" + cluster_name + "' not found", ErrorCodes::BAD_GET); + res = tryGetReplicatedDatabaseCluster(cluster_name); + if (res) + return res; - return res; + throw Exception("Requested cluster '" + cluster_name + "' not found", ErrorCodes::BAD_GET); } diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index c4d330831bb..2fcd75b1f23 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -136,7 +136,7 @@ std::optional getIdentMembership(const ASTIdentifier & ident, const std: std::optional table_pos = IdentifierSemantic::getMembership(ident); if (table_pos) return table_pos; - return IdentifierSemantic::chooseTableColumnMatch(ident, tables); + return IdentifierSemantic::chooseTableColumnMatch(ident, tables, true); } std::optional getIdentsMembership(const ASTPtr ast, diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 0c4c8a1bc34..1cfd113e81f 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -43,20 +44,47 @@ bool HostID::isLocalAddress(UInt16 clickhouse_port) const } } +void DDLLogEntry::assertVersion() const +{ + constexpr UInt64 max_version = 2; + if (version == 0 || max_version < version) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}." 
+ "Maximum supported version is {}", version, max_version); +} + +void DDLLogEntry::setSettingsIfRequired(const Context & context) +{ + version = context.getSettingsRef().distributed_ddl_entry_format_version; + if (version == 2) + settings.emplace(context.getSettingsRef().changes()); +} String DDLLogEntry::toString() const { WriteBufferFromOwnString wb; - Strings host_id_strings(hosts.size()); - std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); - - auto version = CURRENT_VERSION; wb << "version: " << version << "\n"; wb << "query: " << escape << query << "\n"; - wb << "hosts: " << host_id_strings << "\n"; + + bool write_hosts = version == 1 || !hosts.empty(); + if (write_hosts) + { + Strings host_id_strings(hosts.size()); + std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); + wb << "hosts: " << host_id_strings << "\n"; + } + wb << "initiator: " << initiator << "\n"; + bool write_settings = 1 <= version && settings && !settings->empty(); + if (write_settings) + { + ASTSetQuery ast; + ast.is_standalone = false; + ast.changes = *settings; + wb << "settings: " << serializeAST(ast) << "\n"; + } + return wb.str(); } @@ -64,25 +92,46 @@ void DDLLogEntry::parse(const String & data) { ReadBufferFromString rb(data); - int version; rb >> "version: " >> version >> "\n"; - - if (version != CURRENT_VERSION) - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); + assertVersion(); Strings host_id_strings; rb >> "query: " >> escape >> query >> "\n"; - rb >> "hosts: " >> host_id_strings >> "\n"; + if (version == 1) + { + rb >> "hosts: " >> host_id_strings >> "\n"; - if (!rb.eof()) - rb >> "initiator: " >> initiator >> "\n"; - else - initiator.clear(); + if (!rb.eof()) + rb >> "initiator: " >> initiator >> "\n"; + else + initiator.clear(); + } + else if (version == 2) + { + + if (!rb.eof() && *rb.position() == 'h') + rb >> "hosts: " >> host_id_strings >> "\n"; + if (!rb.eof() && *rb.position() == 'i') + rb >> "initiator: " >> initiator >> "\n"; + if (!rb.eof() && *rb.position() == 's') + { + String settings_str; + rb >> "settings: " >> settings_str >> "\n"; + ParserSetQuery parser{true}; + constexpr UInt64 max_size = 4096; + constexpr UInt64 max_depth = 16; + ASTPtr settings_ast = parseQuery(parser, settings_str, max_size, max_depth); + settings.emplace(std::move(settings_ast->as()->changes)); + } + } assertEOF(rb); - hosts.resize(host_id_strings.size()); - std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); + if (!host_id_strings.empty()) + { + hosts.resize(host_id_strings.size()); + std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); + } } @@ -102,6 +151,8 @@ std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context, c query_context->makeQueryContext(); query_context->setCurrentQueryId(""); // generate random query_id query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (entry.settings) + query_context->applySettingsChanges(*entry.settings); return query_context; } @@ -345,4 +396,11 @@ void ZooKeeperMetadataTransaction::commit() state = COMMITTED; } +ClusterPtr tryGetReplicatedDatabaseCluster(const String & cluster_name) +{ + if (const auto * replicated_db = dynamic_cast(DatabaseCatalog::instance().tryGetDatabase(cluster_name).get())) + return replicated_db->getCluster(); + return {}; +} + } diff --git a/src/Interpreters/DDLTask.h 
b/src/Interpreters/DDLTask.h index 45702599fcf..b794668f802 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -18,6 +18,7 @@ namespace DB class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; +using ClusterPtr = std::shared_ptr; class DatabaseReplicated; class ZooKeeperMetadataTransaction; @@ -56,15 +57,16 @@ struct HostID struct DDLLogEntry { + UInt64 version = 1; String query; std::vector hosts; String initiator; // optional + std::optional settings; - static constexpr int CURRENT_VERSION = 1; - + void setSettingsIfRequired(const Context & context); String toString() const; - void parse(const String & data); + void assertVersion() const; }; struct DDLTaskBase @@ -192,4 +194,6 @@ public: ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exceptions()); } }; +ClusterPtr tryGetReplicatedDatabaseCluster(const String & cluster_name); + } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d1af86e7b11..1c63dee1de2 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -712,6 +712,19 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const } } +static void generateUUIDForTable(ASTCreateQuery & create) +{ + if (create.uuid == UUIDHelpers::Nil) + create.uuid = UUIDHelpers::generateV4(); + + /// If destination table (to_table_id) is not specified for materialized view, + /// then MV will create inner table. We should generate UUID of inner table here, + /// so it will be the same on all hosts if query in ON CLUSTER or database engine is Replicated. + bool need_uuid_for_inner_table = create.is_materialized_view && !create.to_table_id; + if (need_uuid_for_inner_table && create.to_inner_uuid == UUIDHelpers::Nil) + create.to_inner_uuid = UUIDHelpers::generateV4(); +} + void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const DatabasePtr & database) const { const auto * kind = create.is_dictionary ? 
"Dictionary" : "Table"; @@ -743,18 +756,19 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data kind_upper, create.table); } - if (create.uuid == UUIDHelpers::Nil) - create.uuid = UUIDHelpers::generateV4(); + generateUUIDForTable(create); } else { bool is_on_cluster = context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - if (create.uuid != UUIDHelpers::Nil && !is_on_cluster) + bool has_uuid = create.uuid != UUIDHelpers::Nil || create.to_inner_uuid != UUIDHelpers::Nil; + if (has_uuid && !is_on_cluster) throw Exception(ErrorCodes::INCORRECT_QUERY, "{} UUID specified, but engine of database {} is not Atomic", kind, create.database); /// Ignore UUID if it's ON CLUSTER query create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } if (create.replace_table) @@ -803,6 +817,17 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (create.attach && !create.storage && !create.columns_list) { auto database = DatabaseCatalog::instance().getDatabase(database_name); + if (database->getEngineName() == "Replicated") + { + auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, create.table); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) + { + create.database = database_name; + guard->releaseTableLock(); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); + } + } + bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached (even permanently) @@ -876,7 +901,6 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); - database = DatabaseCatalog::instance().getDatabase(create.database); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { assertOrSetUUID(create, database); @@ -1134,8 +1158,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, cons /// For CREATE query generate UUID on initiator, so it will be the same on all hosts. /// It will be ignored if database does not support UUIDs. - if (create.uuid == UUIDHelpers::Nil) - create.uuid = UUIDHelpers::generateV4(); + generateUUIDForTable(create); /// For cross-replication cluster we cannot use UUID in replica path. String cluster_name_expanded = context.getMacros()->expand(cluster_name); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 33e93a79c41..b30996b1dbf 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -133,10 +133,6 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat !is_drop_or_detach_database; if (is_replicated_ddl_query) { - if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently) - throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " - "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); - if (query.kind == ASTDropQuery::Kind::Detach) context.checkAccess(table->isView() ? 
AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); else if (query.kind == ASTDropQuery::Kind::Truncate) diff --git a/src/Interpreters/InterpreterGrantQuery.cpp b/src/Interpreters/InterpreterGrantQuery.cpp index dafe4d2e18c..034ebcec050 100644 --- a/src/Interpreters/InterpreterGrantQuery.cpp +++ b/src/Interpreters/InterpreterGrantQuery.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -209,4 +210,13 @@ void InterpreterGrantQuery::updateRoleFromQuery(Role & role, const ASTGrantQuery updateFromQueryImpl(role, query, roles_to_grant_or_revoke); } +void InterpreterGrantQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & /*ast*/, const Context &) const +{ + auto & query = query_ptr->as(); + if (query.kind == Kind::GRANT) + elem.query_kind = "Grant"; + else if (query.kind == Kind::REVOKE) + elem.query_kind = "Revoke"; +} + } diff --git a/src/Interpreters/InterpreterGrantQuery.h b/src/Interpreters/InterpreterGrantQuery.h index 32810faecd2..645b33346c3 100644 --- a/src/Interpreters/InterpreterGrantQuery.h +++ b/src/Interpreters/InterpreterGrantQuery.h @@ -21,6 +21,7 @@ public: static void updateUserFromQuery(User & user, const ASTGrantQuery & query); static void updateRoleFromQuery(Role & role, const ASTGrantQuery & query); + void extendQueryLogElemImpl(QueryLogElement &, const ASTPtr &, const Context &) const override; private: ASTPtr query_ptr; diff --git a/src/Interpreters/InterpreterShowCreateQuery.cpp b/src/Interpreters/InterpreterShowCreateQuery.cpp index 10c8339c135..a853c4b9a9b 100644 --- a/src/Interpreters/InterpreterShowCreateQuery.cpp +++ b/src/Interpreters/InterpreterShowCreateQuery.cpp @@ -82,6 +82,7 @@ BlockInputStreamPtr InterpreterShowCreateQuery::executeImpl() { auto & create = create_query->as(); create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } WriteBufferFromOwnString buf; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index ece3209621b..53fb865be2d 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -754,4 +754,9 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() return required_access; } +void InterpreterSystemQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & /*ast*/, const Context &) const +{ + elem.query_kind = "System"; +} + } diff --git a/src/Interpreters/InterpreterSystemQuery.h b/src/Interpreters/InterpreterSystemQuery.h index 6fa0a432191..39f8bf69a6e 100644 --- a/src/Interpreters/InterpreterSystemQuery.h +++ b/src/Interpreters/InterpreterSystemQuery.h @@ -56,6 +56,8 @@ private: AccessRightsElements getRequiredAccessForDDLOnCluster() const; void startStopAction(StorageActionBlockType action_type, bool start); + + void extendQueryLogElemImpl(QueryLogElement &, const ASTPtr &, const Context &) const override; }; diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h index db95a00d0f7..b4768c30f32 100644 --- a/src/Interpreters/InterserverIOHandler.h +++ b/src/Interpreters/InterserverIOHandler.h @@ -16,6 +16,12 @@ #include #include +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} + namespace DB { diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 1937fbaf905..428f403efd8 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -13,6 +13,9 @@ #include 
#include #include +#include +#include +#include #include namespace fs = std::filesystem; @@ -163,18 +166,32 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & cont entry.hosts = std::move(hosts); entry.query = queryToString(query_ptr); entry.initiator = ddl_worker.getCommonHostID(); + entry.setSettingsIfRequired(context); String node_path = ddl_worker.enqueueQuery(entry); + return getDistributedDDLStatus(node_path, entry, context); +} + +BlockIO getDistributedDDLStatus(const String & node_path, const DDLLogEntry & entry, const Context & context, const std::optional & hosts_to_wait) +{ BlockIO io; if (context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - auto stream = std::make_shared(node_path, entry, context); - io.in = std::move(stream); + auto stream = std::make_shared(node_path, entry, context, hosts_to_wait); + if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NONE) + { + /// Wait for query to finish, but ignore output + NullBlockOutputStream output{Block{}}; + copyData(*stream, output); + } + else + { + io.in = std::move(stream); + } return io; } - DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, const std::optional & hosts_to_wait) : node_path(zk_node_path) @@ -182,19 +199,36 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path , watch(CLOCK_MONOTONIC_COARSE) , log(&Poco::Logger::get("DDLQueryStatusInputStream")) { + if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::THROW || + context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NONE) + throw_on_timeout = true; + else if (context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT || + context.getSettingsRef().distributed_ddl_output_mode == DistributedDDLOutputMode::NEVER_THROW) + throw_on_timeout = false; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown output mode"); + + auto maybe_make_nullable = [&](const DataTypePtr & type) -> DataTypePtr + { + if (throw_on_timeout) + return type; + return std::make_shared(type); + }; + sample = Block{ - {std::make_shared(), "host"}, - {std::make_shared(), "port"}, - {std::make_shared(), "status"}, - {std::make_shared(), "error"}, - {std::make_shared(), "num_hosts_remaining"}, - {std::make_shared(), "num_hosts_active"}, + {std::make_shared(), "host"}, + {std::make_shared(), "port"}, + {maybe_make_nullable(std::make_shared()), "status"}, + {maybe_make_nullable(std::make_shared()), "error"}, + {std::make_shared(), "num_hosts_remaining"}, + {std::make_shared(), "num_hosts_active"}, }; if (hosts_to_wait) { waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); by_hostname = false; + sample.erase("port"); } else { @@ -207,12 +241,29 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; } +std::pair DDLQueryStatusInputStream::parseHostAndPort(const String & host_id) const +{ + String host = host_id; + UInt16 port = 0; + if (by_hostname) + { + auto host_and_port = Cluster::Address::fromString(host_id); + host = host_and_port.first; + port = host_and_port.second; + } + return {host, port}; +} + Block DDLQueryStatusInputStream::readImpl() { Block res; - if (num_hosts_finished >= waiting_hosts.size()) + bool all_hosts_finished = num_hosts_finished >= waiting_hosts.size(); 
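The four distributed_ddl_output_mode values split along two axes: whether the stream throws TIMEOUT_EXCEEDED when distributed_ddl_task_timeout expires, and whether the status/error columns are made Nullable so unfinished hosts can be reported as NULL rows (NEVER_THROW additionally suppresses rethrowing per-host errors, as seen further below). A compact restatement of that mapping:

#include <stdexcept>

enum class DistributedDDLOutputMode { NONE, THROW, NULL_STATUS_ON_TIMEOUT, NEVER_THROW };

struct OutputBehaviour
{
    bool throw_on_timeout;         /// raise TIMEOUT_EXCEEDED instead of returning rows for unfinished hosts
    bool nullable_status_columns;  /// status/error become Nullable so NULL can mean "not finished yet"
};

/// Mirrors the constructor logic above: THROW and NONE keep the old "fail on timeout"
/// behaviour, the two new modes return the partial result instead.
OutputBehaviour behaviourFor(DistributedDDLOutputMode mode)
{
    switch (mode)
    {
        case DistributedDDLOutputMode::THROW:
        case DistributedDDLOutputMode::NONE:
            return {true, false};
        case DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT:
        case DistributedDDLOutputMode::NEVER_THROW:
            return {false, true};
    }
    throw std::logic_error("unknown output mode");
}

int main()
{
    auto b = behaviourFor(DistributedDDLOutputMode::NEVER_THROW);
    (void)b;
}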
+ /// Seems like num_hosts_finished cannot be strictly greater than waiting_hosts.size() + assert(num_hosts_finished <= waiting_hosts.size()); + if (all_hosts_finished || timeout_exceeded) { - if (first_exception) + bool throw_if_error_on_host = context.getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW; + if (first_exception && throw_if_error_on_host) throw Exception(*first_exception); return res; @@ -225,7 +276,8 @@ Block DDLQueryStatusInputStream::readImpl() { if (isCancelled()) { - if (first_exception) + bool throw_if_error_on_host = context.getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW; + if (first_exception && throw_if_error_on_host) throw Exception(*first_exception); return res; @@ -236,11 +288,36 @@ Block DDLQueryStatusInputStream::readImpl() size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; size_t num_active_hosts = current_active_hosts.size(); + constexpr const char * msg_format = "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " + "There are {} unfinished hosts ({} of them are currently active), " + "they are going to execute the query in background"; + if (throw_on_timeout) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, msg_format, + node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " - "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + timeout_exceeded = true; + LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + + NameSet unfinished_hosts = waiting_hosts; + for (const auto & host_id : finished_hosts) + unfinished_hosts.erase(host_id); + + /// Query is not finished on the rest hosts, so fill the corresponding rows with NULLs. 
+ MutableColumns columns = sample.cloneEmptyColumns(); + for (const String & host_id : unfinished_hosts) + { + auto [host, port] = parseHostAndPort(host_id); + size_t num = 0; + columns[num++]->insert(host); + if (by_hostname) + columns[num++]->insert(port); + columns[num++]->insert(Field{}); + columns[num++]->insert(Field{}); + columns[num++]->insert(num_unfinished_hosts); + columns[num++]->insert(num_active_hosts); + } + res = sample.cloneWithColumns(std::move(columns)); + return res; } if (num_hosts_finished != 0 || try_number != 0) @@ -272,26 +349,21 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - String host = host_id; - UInt16 port = 0; - if (by_hostname) - { - auto host_and_port = Cluster::Address::fromString(host_id); - host = host_and_port.first; - port = host_and_port.second; - } + auto [host, port] = parseHostAndPort(host_id); if (status.code != 0 && first_exception == nullptr) first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); ++num_hosts_finished; - columns[0]->insert(host); - columns[1]->insert(port); - columns[2]->insert(status.code); - columns[3]->insert(status.message); - columns[4]->insert(waiting_hosts.size() - num_hosts_finished); - columns[5]->insert(current_active_hosts.size()); + size_t num = 0; + columns[num++]->insert(host); + if (by_hostname) + columns[num++]->insert(port); + columns[num++]->insert(status.code); + columns[num++]->insert(status.message); + columns[num++]->insert(waiting_hosts.size() - num_hosts_finished); + columns[num++]->insert(current_active_hosts.size()); } res = sample.cloneWithColumns(std::move(columns)); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 2b272d3b0da..196cd7c5c2c 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -24,6 +24,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & conte BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option = false); BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option = false); +BlockIO getDistributedDDLStatus(const String & node_path, const DDLLogEntry & entry, const Context & context, const std::optional & hosts_to_wait = {}); class DDLQueryStatusInputStream final : public IBlockInputStream { @@ -44,6 +45,8 @@ private: Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts); + std::pair parseHostAndPort(const String & host_id) const; + String node_path; const Context & context; Stopwatch watch; @@ -62,6 +65,8 @@ private: Int64 timeout_seconds = 120; bool by_hostname = true; + bool throw_on_timeout = true; + bool timeout_exceeded = false; }; } diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index 2af0d2d4a45..1192fcc6ebd 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -297,12 +297,20 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (to_table_id) { + assert(is_materialized_view && to_inner_uuid == UUIDHelpers::Nil); settings.ostr << (settings.hilite ? hilite_keyword : "") << " TO " << (settings.hilite ? hilite_none : "") << (!to_table_id.database_name.empty() ? backQuoteIfNeed(to_table_id.database_name) + "." 
: "") << backQuoteIfNeed(to_table_id.table_name); } + if (to_inner_uuid != UUIDHelpers::Nil) + { + assert(is_materialized_view && !to_table_id); + settings.ostr << (settings.hilite ? hilite_keyword : "") << " TO INNER UUID " << (settings.hilite ? hilite_none : "") + << quoteString(toString(to_inner_uuid)); + } + if (!as_table.empty()) { settings.ostr diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index c9a6251cb94..d6d5c22240c 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -66,6 +66,7 @@ public: ASTExpressionList * tables = nullptr; StorageID to_table_id = StorageID::createEmpty(); /// For CREATE MATERIALIZED VIEW mv TO table. + UUID to_inner_uuid = UUIDHelpers::Nil; /// For materialized view with inner table ASTStorage * storage = nullptr; String as_database; String as_table; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 4cef79fdf42..bfd51b7633d 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -780,6 +780,7 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec ASTPtr table; ASTPtr to_table; + ASTPtr to_inner_uuid; ASTPtr columns_list; ASTPtr storage; ASTPtr as_database; @@ -830,9 +831,16 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec return false; } - // TO [db.]table - if (ParserKeyword{"TO"}.ignore(pos, expected)) + + if (ParserKeyword{"TO INNER UUID"}.ignore(pos, expected)) { + ParserLiteral literal_p; + if (!literal_p.parse(pos, to_inner_uuid, expected)) + return false; + } + else if (ParserKeyword{"TO"}.ignore(pos, expected)) + { + // TO [db.]table if (!table_name_p.parse(pos, to_table, expected)) return false; } @@ -883,6 +891,8 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec if (to_table) query->to_table_id = getTableIdentifier(to_table); + if (to_inner_uuid) + query->to_inner_uuid = parseFromString(to_inner_uuid->as()->value.get()); query->set(query->columns_list, columns_list); query->set(query->storage, storage); diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 3216b46e7ce..7081695ad49 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,8 @@ namespace ErrorCodes extern const int INSECURE_PATH; extern const int CORRUPTED_DATA; extern const int LOGICAL_ERROR; + extern const int S3_ERROR; + extern const int INCORRECT_PART_TYPE; } namespace DataPartsExchange @@ -48,6 +51,7 @@ constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_SIZE_AND_TTL_INFOS = 2; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_TYPE = 3; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION = 4; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_UUID = 5; +constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY = 6; std::string getEndpointId(const std::string & node_id) @@ -112,7 +116,7 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write } /// We pretend to work as older server version, to be sure that client will correctly process our version - response.addCookie({"server_protocol_version", toString(std::min(client_protocol_version, REPLICATION_PROTOCOL_VERSION_WITH_PARTS_UUID))}); + response.addCookie({"server_protocol_version", toString(std::min(client_protocol_version, 
REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY))}); ++total_sends; SCOPE_EXIT({--total_sends;}); @@ -148,7 +152,30 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write sendPartFromMemory(part, out); else { - sendPartFromDisk(part, out, client_protocol_version); + bool try_use_s3_copy = false; + + if (data_settings->allow_s3_zero_copy_replication + && client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) + { /// if source and destination are in the same S3 storage we try to use S3 CopyObject request first + int send_s3_metadata = parse(params.get("send_s3_metadata", "0")); + if (send_s3_metadata == 1) + { + auto disk = part->volume->getDisk(); + if (disk->getType() == DB::DiskType::Type::S3) + { + try_use_s3_copy = true; + } + } + } + if (try_use_s3_copy) + { + response.addCookie({"send_s3_metadata", "1"}); + sendPartS3Metadata(part, out); + } + else + { + sendPartFromDisk(part, out, client_protocol_version); + } } } catch (const NetException &) @@ -229,6 +256,55 @@ void Service::sendPartFromDisk(const MergeTreeData::DataPartPtr & part, WriteBuf part->checksums.checkEqual(data_checksums, false); } +void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteBuffer & out) +{ + /// We'll take a list of files from the list of checksums. + MergeTreeData::DataPart::Checksums checksums = part->checksums; + /// Add files that are not in the checksum list. + auto file_names_without_checksums = part->getFileNamesWithoutChecksums(); + for (const auto & file_name : file_names_without_checksums) + checksums.files[file_name] = {}; + + auto disk = part->volume->getDisk(); + if (disk->getType() != DB::DiskType::Type::S3) + throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); + + part->storage.lockSharedData(*part); + + String part_id = part->getUniqueId(); + writeStringBinary(part_id, out); + + writeBinary(checksums.files.size(), out); + for (const auto & it : checksums.files) + { + String file_name = it.first; + + String metadata_file = disk->getPath() + part->getFullRelativePath() + file_name; + + Poco::File metadata(metadata_file); + + if (!metadata.exists()) + throw Exception("S3 metadata '" + file_name + "' does not exist", ErrorCodes::CORRUPTED_DATA); + if (!metadata.isFile()) + throw Exception("S3 metadata '" + file_name + "' is not a file", ErrorCodes::CORRUPTED_DATA); + UInt64 file_size = metadata.getSize(); + + writeStringBinary(it.first, out); + writeBinary(file_size, out); + + auto file_in = createReadBufferFromFileBase(metadata_file, 0, 0, 0, DBMS_DEFAULT_BUFFER_SIZE); + HashingWriteBuffer hashing_out(out); + copyData(*file_in, hashing_out, blocker.getCounter()); + if (blocker.isCancelled()) + throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED); + + if (hashing_out.count() != file_size) + throw Exception("Unexpected size of file " + metadata_file, ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + + writePODBinary(hashing_out.getHash(), out); + } +} + MergeTreeData::DataPartPtr Service::findPart(const String & name) { /// It is important to include PreCommitted and Outdated parts here because remote replicas cannot reliably @@ -253,7 +329,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( const String & interserver_scheme, bool to_detached, const String & tmp_prefix_, - std::optional * tagger_ptr) + std::optional * tagger_ptr, + bool try_use_s3_copy, + const DiskPtr disk_s3) { if (blocker.isCancelled()) throw Exception("Fetching of part was cancelled",
ErrorCodes::ABORTED); @@ -270,10 +348,36 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( { {"endpoint", getEndpointId(replica_path)}, {"part", part_name}, - {"client_protocol_version", toString(REPLICATION_PROTOCOL_VERSION_WITH_PARTS_UUID)}, + {"client_protocol_version", toString(REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY)}, {"compress", "false"} }); + if (try_use_s3_copy && disk_s3 && disk_s3->getType() != DB::DiskType::Type::S3) + throw Exception("Try to fetch shared s3 part on non-s3 disk", ErrorCodes::LOGICAL_ERROR); + + Disks disks_s3; + + if (!data_settings->allow_s3_zero_copy_replication) + try_use_s3_copy = false; + + if (try_use_s3_copy) + { + if (disk_s3) + disks_s3.push_back(disk_s3); + else + { + disks_s3 = data.getDisksByType(DiskType::Type::S3); + + if (disks_s3.empty()) + try_use_s3_copy = false; + } + } + + if (try_use_s3_copy) + { + uri.addQueryParameter("send_s3_metadata", "1"); + } + Poco::Net::HTTPBasicCredentials creds{}; if (!user.empty()) { @@ -294,6 +398,44 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( int server_protocol_version = parse(in.getResponseCookie("server_protocol_version", "0")); + int send_s3 = parse(in.getResponseCookie("send_s3_metadata", "0")); + + if (send_s3 == 1) + { + if (server_protocol_version < REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) + throw Exception("Got 'send_s3_metadata' cookie with old protocol version", ErrorCodes::LOGICAL_ERROR); + if (!try_use_s3_copy) + throw Exception("Got 'send_s3_metadata' cookie when was not requested", ErrorCodes::LOGICAL_ERROR); + + size_t sum_files_size = 0; + readBinary(sum_files_size, in); + IMergeTreeDataPart::TTLInfos ttl_infos; + /// Skip ttl infos, not required for S3 metadata + String ttl_infos_string; + readBinary(ttl_infos_string, in); + String part_type = "Wide"; + readStringBinary(part_type, in); + if (part_type == "InMemory") + throw Exception("Got 'send_s3_metadata' cookie for in-memory part", ErrorCodes::INCORRECT_PART_TYPE); + + UUID part_uuid = UUIDHelpers::Nil; + if (server_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_UUID) + readUUIDText(part_uuid, in); + + try + { + return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, std::move(disks_s3), in); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::S3_ERROR) + throw; + /// Try again but without S3 copy + return fetchPart(metadata_snapshot, part_name, replica_path, host, port, timeouts, + user, password, interserver_scheme, to_detached, tmp_prefix_, nullptr, false); + } + } + ReservationPtr reservation; size_t sum_files_size = 0; if (server_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_SIZE) @@ -471,6 +613,100 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( return new_data_part; } +MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( + const String & part_name, + const String & replica_path, + bool to_detached, + const String & tmp_prefix_, + const Disks & disks_s3, + PooledReadWriteBufferFromHTTP & in + ) +{ + if (disks_s3.empty()) + throw Exception("No S3 disks anymore", ErrorCodes::LOGICAL_ERROR); + + String part_id; + readStringBinary(part_id, in); + + DiskPtr disk = disks_s3[0]; + + for (const auto & disk_s3 : disks_s3) + { + if (disk_s3->checkUniqueId(part_id)) + { + disk = disk_s3; + break; + } + } + + static const String TMP_PREFIX = "tmp_fetch_"; + String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; + + String part_relative_path = String(to_detached ? 
"detached/" : "") + tmp_prefix + part_name; + String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; + + if (disk->exists(part_download_path)) + throw Exception("Directory " + fullPath(disk, part_download_path) + " already exists.", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + + CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; + + disk->createDirectories(part_download_path); + + size_t files; + readBinary(files, in); + + auto volume = std::make_shared("volume_" + part_name, disk); + MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + + for (size_t i = 0; i < files; ++i) + { + String file_name; + UInt64 file_size; + + readStringBinary(file_name, in); + readBinary(file_size, in); + + String data_path = new_data_part->getFullRelativePath() + file_name; + String metadata_file = fullPath(disk, data_path); + + { + auto file_out = std::make_unique(metadata_file, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0); + + HashingWriteBuffer hashing_out(*file_out); + + copyData(in, hashing_out, file_size, blocker.getCounter()); + + if (blocker.isCancelled()) + { + /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, + /// performing a poll with a not very large timeout. + /// And now we check it only between read chunks (in the `copyData` function). + disk->removeSharedRecursive(part_download_path, true); + throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); + } + + MergeTreeDataPartChecksum::uint128 expected_hash; + readPODBinary(expected_hash, in); + + if (expected_hash != hashing_out.getHash()) + { + throw Exception("Checksum mismatch for file " + metadata_file + " transferred from " + replica_path, + ErrorCodes::CHECKSUM_DOESNT_MATCH); + } + } + } + + assertEOF(in); + + new_data_part->is_temp = true; + new_data_part->modification_time = time(nullptr); + new_data_part->loadColumnsChecksumsIndexes(true, false); + + new_data_part->storage.lockSharedData(*new_data_part); + + return new_data_part; +} + } } diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 2e2aad57c3c..e9b3d443fcd 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -9,6 +9,12 @@ #include +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} + namespace DB { @@ -32,6 +38,7 @@ private: MergeTreeData::DataPartPtr findPart(const String & name); void sendPartFromMemory(const MergeTreeData::DataPartPtr & part, WriteBuffer & out); void sendPartFromDisk(const MergeTreeData::DataPartPtr & part, WriteBuffer & out, int client_protocol_version); + void sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteBuffer & out); /// StorageReplicatedMergeTree::shutdown() waits for all parts exchange handlers to finish, /// so Service will never access dangling reference to storage @@ -59,7 +66,9 @@ public: const String & interserver_scheme, bool to_detached = false, const String & tmp_prefix_ = "", - std::optional * tagger_ptr = nullptr); + std::optional * tagger_ptr = nullptr, + bool try_use_s3_copy = true, + const DiskPtr disk_s3 = nullptr); /// You need to stop the data transfer. 
ActionBlocker blocker; @@ -81,6 +90,14 @@ private: ReservationPtr reservation, PooledReadWriteBufferFromHTTP & in); + MergeTreeData::MutableDataPartPtr downloadPartToS3( + const String & part_name, + const String & replica_path, + bool to_detached, + const String & tmp_prefix_, + const Disks & disks_s3, + PooledReadWriteBufferFromHTTP & in); + MergeTreeData & data; Poco::Logger * log; }; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 1568ca16254..1f18c894465 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -9,8 +9,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -35,6 +37,7 @@ namespace CurrentMetrics namespace DB { + namespace ErrorCodes { extern const int DIRECTORY_ALREADY_EXISTS; @@ -404,7 +407,7 @@ void IMergeTreeDataPart::removeIfNeeded() } } - remove(); + remove(false); if (state == State::DeleteOnDestroy) { @@ -934,7 +937,8 @@ void IMergeTreeDataPart::loadColumns(bool require) { /// We can get list of columns only from columns.txt in compact parts. if (require || part_type == Type::COMPACT) - throw Exception("No columns.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception("No columns.txt in part " + name + ", expected path " + path + " on drive " + volume->getDisk()->getName(), + ErrorCodes::NO_FILE_IN_DATA_PART); /// If there is no file with a list of columns, write it down. for (const NameAndTypePair & column : metadata_snapshot->getColumns().getAllPhysical()) @@ -1015,10 +1019,12 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ SyncGuardPtr sync_guard; if (storage.getSettings()->fsync_part_directory) sync_guard = volume->getDisk()->getDirectorySyncGuard(to); + + storage.lockSharedData(*this); } -void IMergeTreeDataPart::remove() const +void IMergeTreeDataPart::remove(bool keep_s3) const { if (!isStoredOnDisk()) return; @@ -1048,7 +1054,7 @@ void IMergeTreeDataPart::remove() const try { - volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } catch (...) { @@ -1071,7 +1077,7 @@ void IMergeTreeDataPart::remove() const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. 
- volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } else { @@ -1084,16 +1090,16 @@ void IMergeTreeDataPart::remove() const # pragma GCC diagnostic ignored "-Wunused-variable" #endif for (const auto & [file, _] : checksums.files) - volume->getDisk()->removeFile(to + "/" + file); + volume->getDisk()->removeSharedFile(to + "/" + file, keep_s3); #if !__clang__ # pragma GCC diagnostic pop #endif for (const auto & file : {"checksums.txt", "columns.txt"}) - volume->getDisk()->removeFile(to + "/" + file); + volume->getDisk()->removeSharedFile(to + "/" + file, keep_s3); - volume->getDisk()->removeFileIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME); - volume->getDisk()->removeFileIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME); + volume->getDisk()->removeSharedFileIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME, keep_s3); + volume->getDisk()->removeSharedFileIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME, keep_s3); volume->getDisk()->removeDirectory(to); } @@ -1103,7 +1109,7 @@ void IMergeTreeDataPart::remove() const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(volume->getDisk(), to), getCurrentExceptionMessage(false)); - volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } } } @@ -1168,7 +1174,6 @@ void IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & di disk->removeRecursive(path_to_clone + relative_path + '/'); } disk->createDirectories(path_to_clone); - volume->getDisk()->copy(getFullRelativePath(), disk, path_to_clone); volume->getDisk()->removeFileIfExists(path_to_clone + '/' + DELETE_ON_DESTROY_MARKER_FILE_NAME); } @@ -1305,6 +1310,21 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada return true; } +String IMergeTreeDataPart::getUniqueId() const +{ + String id; + + auto disk = volume->getDisk(); + + if (disk->getType() == DB::DiskType::Type::S3) + id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); + + if (id.empty()) + throw Exception("Can't get unique S3 object", ErrorCodes::LOGICAL_ERROR); + + return id; +} + bool isCompactPart(const MergeTreeDataPartPtr & data_part) { return (data_part && data_part->getType() == MergeTreeDataPartType::COMPACT); @@ -1321,3 +1341,4 @@ bool isInMemoryPart(const MergeTreeDataPartPtr & data_part) } } + diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 2f531bd8391..83f8c672001 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -22,6 +22,12 @@ #include +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} + namespace DB { @@ -124,7 +130,7 @@ public: /// Throws an exception if part is not stored in on-disk format. void assertOnDisk() const; - void remove() const; + void remove(bool keep_s3 = false) const; /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load checksums from checksums.txt if exists. Load index if required. @@ -361,6 +367,10 @@ public: /// part creation (using alter query with materialize_ttl setting). 
bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; + /// Return a unique string for the part file + /// Required to distinguish different copies of the same part on S3 + String getUniqueId() const; + protected: /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index e1c351bd323..d1371609e1a 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -691,6 +691,8 @@ public: /// Reserves 0 bytes ReservationPtr makeEmptyReservationOnLargestDisk() { return getStoragePolicy()->makeEmptyReservationOnLargestDisk(); } + Disks getDisksByType(DiskType::Type type) const { return getStoragePolicy()->getDisksByType(type); } + /// Return alter conversions for part which must be applied on fly. AlterConversions getAlterConversionsForPart(const MergeTreeDataPartPtr part) const; /// Returns destination disk or volume for the TTL rule according to current storage policy @@ -767,6 +769,18 @@ public: std::optional getDataMovingJob(); bool areBackgroundMovesNeeded() const; + /// Lock part in zookeeper to allow sharing common S3 data between several nodes + /// Overridden in StorageReplicatedMergeTree + virtual void lockSharedData(const IMergeTreeDataPart &) const {} + + /// Unlock common S3 data part in zookeeper + /// Overridden in StorageReplicatedMergeTree + virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; } + + /// Fetch part only if some replica has it on shared storage like S3 + /// Overridden in StorageReplicatedMergeTree + virtual bool tryToFetchIfShared(const IMergeTreeDataPart &, const DiskPtr &, const String &) { return false; } + /// Parts that currently submerging (merging to bigger parts) or emerging /// (to be appeared after merging finished). These two variables have to be used /// with `currently_submerging_emerging_mutex`. diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index f2f8172837c..afcb85c2479 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -757,6 +757,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor std::unique_ptr rows_sources_write_buf; std::optional column_sizes; + SyncGuardPtr sync_guard; + if (chosen_merge_algorithm == MergeAlgorithm::Vertical) { tmp_disk->createDirectories(new_part_tmp_path); @@ -769,6 +771,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor part->accumulateColumnSizes(merged_column_to_size); column_sizes = ColumnSizeEstimator(merged_column_to_size, merging_column_names, gathering_column_names); + + if (data.getSettings()->fsync_part_directory) + sync_guard = disk->getDirectorySyncGuard(new_part_tmp_path); } else { @@ -778,10 +783,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor gathering_column_names.clear(); } - SyncGuardPtr sync_guard; - if (data.getSettings()->fsync_part_directory) - sync_guard = disk->getDirectorySyncGuard(new_part_tmp_path); - /** Read from all parts, merge and write into a new one. * In passing, we calculate expression for sorting.
*/ @@ -1894,9 +1895,9 @@ void MergeTreeDataMergerMutator::finalizeMutatedPart( MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), new_data_part->getFullRelativePath())); new_data_part->default_codec = codec; new_data_part->calculateColumnsSizesOnDisk(); + new_data_part->storage.lockSharedData(*new_data_part); } - bool MergeTreeDataMergerMutator::checkOperationIsNotCanceled(const MergeListEntry & merge_entry) const { if (merges_blocker.isCancelled() || merge_entry->is_cancelled) diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 7b8c88b1bff..41eae7fed38 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -194,15 +194,40 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt if (moves_blocker.isCancelled()) throw Exception("Cancelled moving parts.", ErrorCodes::ABORTED); - LOG_TRACE(log, "Cloning part {}", moving_part.part->name); + auto settings = data->getSettings(); + auto part = moving_part.part; + LOG_TRACE(log, "Cloning part {}", part->name); + + auto disk = moving_part.reserved_space->getDisk(); const String directory_to_move = "moving"; - moving_part.part->makeCloneOnDisk(moving_part.reserved_space->getDisk(), directory_to_move); + if (settings->allow_s3_zero_copy_replication) + { + /// Try to fetch part from S3 without copy and fallback to default copy + /// if it's not possible + moving_part.part->assertOnDisk(); + String path_to_clone = data->getRelativeDataPath() + directory_to_move + '/'; + String relative_path = part->relative_path; + if (disk->exists(path_to_clone + relative_path)) + { + LOG_WARNING(log, "Path " + fullPath(disk, path_to_clone + relative_path) + " already exists. 
Will remove it and clone again."); + disk->removeRecursive(path_to_clone + relative_path + '/'); + } + disk->createDirectories(path_to_clone); + bool is_fetched = data->tryToFetchIfShared(*part, disk, path_to_clone + "/" + part->name); + if (!is_fetched) + part->volume->getDisk()->copy(data->getRelativeDataPath() + relative_path, disk, path_to_clone); + part->volume->getDisk()->removeFileIfExists(path_to_clone + '/' + IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + } + else + { + part->makeCloneOnDisk(disk, directory_to_move); + } - auto single_disk_volume = std::make_shared("volume_" + moving_part.part->name, moving_part.reserved_space->getDisk(), 0); + auto single_disk_volume = std::make_shared("volume_" + part->name, moving_part.reserved_space->getDisk(), 0); MergeTreeData::MutableDataPartPtr cloned_part = - data->createPart(moving_part.part->name, single_disk_volume, directory_to_move + '/' + moving_part.part->name); - LOG_TRACE(log, "Part {} was cloned to {}", moving_part.part->name, cloned_part->getFullPath()); + data->createPart(part->name, single_disk_volume, directory_to_move + '/' + part->name); + LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->getFullPath()); cloned_part->loadColumnsChecksumsIndexes(true, true); return cloned_part; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 4cb1380b49b..8a1ea6eb649 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -71,6 +71,7 @@ struct Settings; M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \ M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \ M(Seconds, execute_merges_on_single_replica_time_threshold, 0, "When greater than zero only a single replica starts the merge immediately, others wait up to that amount of time to download the result instead of doing merges locally. 
If the chosen replica doesn't finish the merge during that amount of time, fallback to standard behavior happens.", 0) \ + M(Seconds, s3_execute_merges_on_single_replica_time_threshold, 3 * 60 * 60, "When greater than zero only a single replica starts the merge immediately when the merged part is on S3 storage and 'allow_s3_zero_copy_replication' is enabled.", 0) \ M(Seconds, try_fetch_recompressed_part_timeout, 7200, "Recompression works slow in most cases, so we don't start merge with recompression until this timeout and trying to fetch recompressed part from replica which assigned this merge with recompression.", 0) \ M(Bool, always_fetch_merged_part, 0, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \ M(UInt64, max_suspicious_broken_parts, 10, "Max broken parts, if more - deny automatic deletion.", 0) \ @@ -114,6 +115,7 @@ struct Settings; M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ + M(Bool, allow_s3_zero_copy_replication, false, "Allow Zero-copy replication over S3", 0) \ M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \ M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 1605ec693cb..1b852622efc 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -91,6 +91,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( new_part->calculateColumnsSizesOnDisk(); if (default_codec != nullptr) new_part->default_codec = default_codec; + new_part->storage.lockSharedData(*new_part); } void MergedBlockOutputStream::finalizePartOnDisk( diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h index cdf5a40d5a9..afd8c963943 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -29,29 +30,29 @@ struct ReplicatedMergeTreeLogEntryData { enum Type { - EMPTY, /// Not used. - GET_PART, /// Get the part from another replica. - MERGE_PARTS, /// Merge the parts. - DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. - CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. - CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. - REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones - MUTATE_PART, /// Apply one or several mutations to the part. - ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths + EMPTY, /// Not used. + GET_PART, /// Get the part from another replica. + MERGE_PARTS, /// Merge the parts.
+ DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. + CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. + CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. + REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones + MUTATE_PART, /// Apply one or several mutations to the part. + ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths }; static String typeToString(Type type) { switch (type) { - case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; - case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; - case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; - case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; - case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return "CLEAR_INDEX"; - case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; - case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; - case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; + case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; + case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; + case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; + case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; + case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return "CLEAR_INDEX"; + case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; + case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; + case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; default: throw Exception("Unknown log entry type: " + DB::toString(type), ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp index d90183abd95..65da6080e86 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp @@ -56,6 +56,17 @@ bool ReplicatedMergeTreeMergeStrategyPicker::shouldMergeOnSingleReplica(const Re } +bool ReplicatedMergeTreeMergeStrategyPicker::shouldMergeOnSingleReplicaS3Shared(const ReplicatedMergeTreeLogEntryData & entry) const +{ + time_t threshold = s3_execute_merges_on_single_replica_time_threshold; + return ( + threshold > 0 /// feature turned on + && entry.type == ReplicatedMergeTreeLogEntry::MERGE_PARTS /// it is a merge log entry + && entry.create_time + threshold > time(nullptr) /// not too much time waited + ); +} + + /// that will return the same replica name for ReplicatedMergeTreeLogEntry on all the replicas (if the replica set is the same). /// that way each replica knows who is responsible for doing a certain merge. 
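The comment above states the key property of pickReplicaToExecuteMerge(): every replica must derive the same executor from the log entry alone. A minimal standalone sketch of that idea follows, assuming a stable hash of the entry's znode name and an identically ordered list of active replicas on every node; the names below are illustrative stand-ins, not the real ClickHouse interfaces.

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Every replica computes the same index from the same log entry, so all replicas agree
// on who should execute the merge without an extra round of coordination.
static std::string pick_executor(const std::string & entry_znode_name,
                                 const std::vector<std::string> & active_replicas_sorted)
{
    if (active_replicas_sorted.empty())
        return {};
    const std::uint64_t entry_hash = std::hash<std::string>{}(entry_znode_name);
    return active_replicas_sorted[entry_hash % active_replicas_sorted.size()];
}

With the new s3_execute_merges_on_single_replica_time_threshold setting, the same selection is additionally applied to merges whose result lands on S3, but only while the time window since the log entry was created has not expired; after that, the usual behavior where any replica may merge takes over.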
@@ -90,18 +101,23 @@ std::optional ReplicatedMergeTreeMergeStrategyPicker::pickReplicaToExecu void ReplicatedMergeTreeMergeStrategyPicker::refreshState() { auto threshold = storage.getSettings()->execute_merges_on_single_replica_time_threshold.totalSeconds(); + auto threshold_s3 = 0; + if (storage.getSettings()->allow_s3_zero_copy_replication) + threshold_s3 = storage.getSettings()->s3_execute_merges_on_single_replica_time_threshold.totalSeconds(); if (threshold == 0) - { /// we can reset the settings w/o lock (it's atomic) execute_merges_on_single_replica_time_threshold = threshold; + if (threshold_s3 == 0) + s3_execute_merges_on_single_replica_time_threshold = threshold_s3; + if (threshold == 0 && threshold_s3 == 0) return; - } auto now = time(nullptr); /// the setting was already enabled, and last state refresh was done recently - if (execute_merges_on_single_replica_time_threshold != 0 + if (((threshold != 0 && execute_merges_on_single_replica_time_threshold != 0) + || (threshold_s3 != 0 && s3_execute_merges_on_single_replica_time_threshold != 0)) && now - last_refresh_time < REFRESH_STATE_MINIMUM_INTERVAL_SECONDS) return; @@ -130,11 +146,15 @@ void ReplicatedMergeTreeMergeStrategyPicker::refreshState() LOG_WARNING(storage.log, "Can't find current replica in the active replicas list, or too few active replicas to use execute_merges_on_single_replica_time_threshold!"); /// we can reset the settings w/o lock (it's atomic) execute_merges_on_single_replica_time_threshold = 0; + s3_execute_merges_on_single_replica_time_threshold = 0; return; } std::lock_guard lock(mutex); - execute_merges_on_single_replica_time_threshold = threshold; + if (threshold != 0) /// Zeros already reset + execute_merges_on_single_replica_time_threshold = threshold; + if (threshold_s3 != 0) + s3_execute_merges_on_single_replica_time_threshold = threshold_s3; last_refresh_time = now; current_replica_index = current_replica_index_tmp; active_replicas = active_replicas_tmp; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.h b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.h index 02a760d1ace..8adf206676a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.h @@ -52,6 +52,10 @@ public: /// and we may need to do a fetch (or postpone) instead of merge bool shouldMergeOnSingleReplica(const ReplicatedMergeTreeLogEntryData & entry) const; + /// return true if s3_execute_merges_on_single_replica_time_threshold feature is active + /// and we may need to do a fetch (or postpone) instead of merge + bool shouldMergeOnSingleReplicaS3Shared(const ReplicatedMergeTreeLogEntryData & entry) const; + /// returns the replica name /// and it's not current replica should do the merge /// used in shouldExecuteLogEntry and in tryExecuteMerge @@ -68,6 +72,7 @@ private: uint64_t getEntryHash(const ReplicatedMergeTreeLogEntryData & entry) const; std::atomic execute_merges_on_single_replica_time_threshold = 0; + std::atomic s3_execute_merges_on_single_replica_time_threshold = 0; std::atomic last_refresh_time = 0; std::mutex mutex; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index d2cb418ce97..f915f76bbfc 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -84,7 +84,7 @@ StorageMaterializedView::StorageMaterializedView( else if (attach_) { /// If there is an ATTACH request, then the internal table must already be 
created. - target_table_id = StorageID(getStorageID().database_name, generateInnerTableName(getStorageID())); + target_table_id = StorageID(getStorageID().database_name, generateInnerTableName(getStorageID()), query.to_inner_uuid); } else { @@ -93,6 +93,7 @@ StorageMaterializedView::StorageMaterializedView( auto manual_create_query = std::make_shared(); manual_create_query->database = getStorageID().database_name; manual_create_query->table = generateInnerTableName(getStorageID()); + manual_create_query->uuid = query.to_inner_uuid; auto new_columns_list = std::make_shared(); new_columns_list->set(new_columns_list->columns, query.columns_list->columns->ptr()); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 54ca48ef24d..d08c5b6ad7c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -142,6 +142,10 @@ static const auto MERGE_SELECTING_SLEEP_MS = 5 * 1000; static const auto MUTATIONS_FINALIZING_SLEEP_MS = 1 * 1000; static const auto MUTATIONS_FINALIZING_IDLE_SLEEP_MS = 5 * 1000; + +std::atomic_uint StorageReplicatedMergeTree::total_fetches {0}; + + void StorageReplicatedMergeTree::setZooKeeper() { /// Every ReplicatedMergeTree table is using only one ZooKeeper session. @@ -539,6 +543,13 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes() /// Mutations zookeeper->createIfNotExists(zookeeper_path + "/mutations", String()); zookeeper->createIfNotExists(replica_path + "/mutation_pointer", String()); + + /// Nodes for zero-copy S3 replication + if (storage_settings.get()->allow_s3_zero_copy_replication) + { + zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3", String()); + zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String()); + } } @@ -1446,9 +1457,12 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) /// In some use cases merging can be more expensive than fetching /// and it may be better to spread merges tasks across the replicas /// instead of doing exactly the same merge cluster-wise + std::optional replica_to_execute_merge; + bool replica_to_execute_merge_picked = false; if (merge_strategy_picker.shouldMergeOnSingleReplica(entry)) { - auto replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry); + replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry); + replica_to_execute_merge_picked = true; if (replica_to_execute_merge) { @@ -1542,6 +1556,25 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) future_merged_part.updatePath(*this, reserved_space); future_merged_part.merge_type = entry.merge_type; + if (storage_settings_ptr->allow_s3_zero_copy_replication) + { + auto disk = reserved_space->getDisk(); + if (disk->getType() == DB::DiskType::Type::S3) + { + if (merge_strategy_picker.shouldMergeOnSingleReplicaS3Shared(entry)) + { + if (!replica_to_execute_merge_picked) + replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry); + + if (replica_to_execute_merge) + { + LOG_DEBUG(log, "Prefer fetching part {} from replica {} due s3_execute_merges_on_single_replica_time_threshold", entry.new_part_name, replica_to_execute_merge.value()); + return false; + } + } + } + } + /// Account TTL merge if (isTTLMergeType(future_merged_part.merge_type)) global_context.getMergeList().bookMergeWithTTL(); @@ -1745,7 +1778,6 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry) const auto storage_settings_ptr = 
getSettings(); auto metadata_snapshot = getInMemoryMetadataPtr(); - static std::atomic_uint total_fetches {0}; if (storage_settings_ptr->replicated_max_parallel_fetches && total_fetches >= storage_settings_ptr->replicated_max_parallel_fetches) { throw Exception("Too many total fetches from replicas, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches.toString(), @@ -1933,6 +1965,57 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry) } +bool StorageReplicatedMergeTree::executeFetchShared( + const String & source_replica, + const String & new_part_name, + const DiskPtr & disk, + const String & path) +{ + if (source_replica.empty()) + { + LOG_INFO(log, "No active replica has part {} on S3.", new_part_name); + return false; + } + + const auto storage_settings_ptr = getSettings(); + auto metadata_snapshot = getInMemoryMetadataPtr(); + + if (storage_settings_ptr->replicated_max_parallel_fetches && total_fetches >= storage_settings_ptr->replicated_max_parallel_fetches) + { + throw Exception("Too many total fetches from replicas, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches.toString(), + ErrorCodes::TOO_MANY_FETCHES); + } + + ++total_fetches; + SCOPE_EXIT({--total_fetches;}); + + if (storage_settings_ptr->replicated_max_parallel_fetches_for_table + && current_table_fetches >= storage_settings_ptr->replicated_max_parallel_fetches_for_table) + { + throw Exception("Too many fetches from replicas for table, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches_for_table.toString(), + ErrorCodes::TOO_MANY_FETCHES); + } + + ++current_table_fetches; + SCOPE_EXIT({--current_table_fetches;}); + + try + { + if (!fetchExistsPart(new_part_name, metadata_snapshot, zookeeper_path + "/replicas/" + source_replica, disk, path)) + return false; + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS) + e.addMessage("Too busy replica. 
Will try later."); + tryLogCurrentException(log, __PRETTY_FUNCTION__); + throw; + } + + return true; +} + + void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) { auto drop_range_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version); @@ -3194,7 +3277,6 @@ String StorageReplicatedMergeTree::findReplicaHavingPart(const String & part_nam return {}; } - String StorageReplicatedMergeTree::findReplicaHavingCoveringPart(LogEntry & entry, bool active) { auto zookeeper = getZooKeeper(); @@ -3456,6 +3538,7 @@ bool StorageReplicatedMergeTree::partIsInsertingWithParallelQuorum(const MergeTr return zookeeper->exists(zookeeper_path + "/quorum/parallel/" + part_info.getPartName()); } + bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo & part_info) const { auto zookeeper = getZooKeeper(); @@ -3477,6 +3560,7 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo & return partition_it->second == part_info.getPartName(); } + bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_) { @@ -3562,6 +3646,10 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora } + ReplicatedMergeTreeAddress address; + ConnectionTimeouts timeouts; + std::pair user_password; + String interserver_scheme; std::optional tagger_ptr; std::function get_part; if (part_to_clone) @@ -3573,10 +3661,10 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora } else { - ReplicatedMergeTreeAddress address(zookeeper->get(source_replica_path + "/host")); - auto timeouts = ConnectionTimeouts::getHTTPTimeouts(global_context); - auto user_password = global_context.getInterserverCredentials(); - String interserver_scheme = global_context.getInterserverScheme(); + address.fromString(zookeeper->get(source_replica_path + "/host")); + timeouts = ConnectionTimeouts::getHTTPTimeouts(global_context); + user_password = global_context.getInterserverCredentials(); + interserver_scheme = global_context.getInterserverScheme(); get_part = [&, address, timeouts, user_password, interserver_scheme]() { @@ -3597,7 +3685,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora interserver_scheme, to_detached, "", - &tagger_ptr); + &tagger_ptr, + true); }; } @@ -3610,11 +3699,6 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora Transaction transaction(*this); renameTempPartAndReplace(part, nullptr, &transaction); - /** NOTE - * Here, an error occurs if ALTER occurred with a change in the column type or column deletion, - * and the part on remote server has not yet been modified. - * After a while, one of the following attempts to make `fetchPart` succeed. - */ replaced_parts = checkPartChecksumsAndCommit(transaction, part); /** If a quorum is tracked for this part, you must update it. 
@@ -3686,6 +3770,104 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora } +bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, + const String & source_replica_path, DiskPtr replaced_disk, String replaced_part_path) +{ + auto zookeeper = getZooKeeper(); + const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); + + if (auto part = getPartIfExists(part_info, {IMergeTreeDataPart::State::Outdated, IMergeTreeDataPart::State::Deleting})) + { + LOG_DEBUG(log, "Part {} should be deleted after previous attempt before fetch", part->name); + /// Force immediate parts cleanup to delete the part that was left from the previous fetch attempt. + cleanup_thread.wakeup(); + return false; + } + + { + std::lock_guard lock(currently_fetching_parts_mutex); + if (!currently_fetching_parts.insert(part_name).second) + { + LOG_DEBUG(log, "Part {} is already fetching right now", part_name); + return false; + } + } + + SCOPE_EXIT + ({ + std::lock_guard lock(currently_fetching_parts_mutex); + currently_fetching_parts.erase(part_name); + }); + + LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path); + + TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + + /// Logging + Stopwatch stopwatch; + MutableDataPartPtr part; + DataPartsVector replaced_parts; + + auto write_part_log = [&] (const ExecutionStatus & execution_status) + { + writePartLog( + PartLogElement::DOWNLOAD_PART, execution_status, stopwatch.elapsed(), + part_name, part, replaced_parts, nullptr); + }; + + std::function get_part; + + ReplicatedMergeTreeAddress address(zookeeper->get(source_replica_path + "/host")); + auto timeouts = ConnectionTimeouts::getHTTPTimeouts(global_context); + auto user_password = global_context.getInterserverCredentials(); + String interserver_scheme = global_context.getInterserverScheme(); + + get_part = [&, address, timeouts, user_password, interserver_scheme]() + { + if (interserver_scheme != address.scheme) + throw Exception("Interserver schemes are different: '" + interserver_scheme + + "' != '" + address.scheme + "', can't fetch part from " + address.host, + ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH); + + return fetcher.fetchPart( + metadata_snapshot, part_name, source_replica_path, + address.host, address.replication_port, + timeouts, user_password.first, user_password.second, interserver_scheme, false, "", nullptr, true, + replaced_disk); + }; + + try + { + part = get_part(); + + if (part->volume->getDisk()->getName() != replaced_disk->getName()) + throw Exception("Part " + part->name + " fetched on wrong disk " + part->volume->getDisk()->getName(), ErrorCodes::LOGICAL_ERROR); + replaced_disk->removeFileIfExists(replaced_part_path); + replaced_disk->moveDirectory(part->getFullRelativePath(), replaced_part_path); + } + catch (const Exception & e) + { + /// The same part is being written right now (but probably it's not committed yet). + /// We will check the need for fetch later. + if (e.code() == ErrorCodes::DIRECTORY_ALREADY_EXISTS) + return false; + + throw; + } + catch (...) 
+ { + write_part_log(ExecutionStatus::fromCurrentException()); + throw; + } + + ProfileEvents::increment(ProfileEvents::ReplicatedPartFetches); + + LOG_DEBUG(log, "Fetched part {} from {}", part_name, source_replica_path); + + return true; +} + + void StorageReplicatedMergeTree::startup() { if (is_readonly) @@ -5422,13 +5604,14 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() } parts.clear(); - auto remove_parts_from_filesystem = [log=log] (const DataPartsVector & parts_to_remove) + auto remove_parts_from_filesystem = [log=log, this] (const DataPartsVector & parts_to_remove) { for (const auto & part : parts_to_remove) { try { - part->remove(); + bool keep_s3 = !this->unlockSharedData(*part); + part->remove(keep_s3); } catch (...) { @@ -6367,6 +6550,7 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, const C return results; } + bool StorageReplicatedMergeTree::canUseAdaptiveGranularity() const { const auto storage_settings_ptr = getSettings(); @@ -6381,10 +6565,196 @@ MutationCommands StorageReplicatedMergeTree::getFirstAlterMutationCommandsForPar return queue.getFirstAlterMutationCommandsForPart(part); } + void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() { if (areBackgroundMovesNeeded()) background_moves_executor.start(); } + +void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) const +{ + if (!part.volume) + return; + DiskPtr disk = part.volume->getDisk(); + if (!disk) + return; + if (disk->getType() != DB::DiskType::Type::S3) + return; + + zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); + if (!zookeeper) + return; + + String id = part.getUniqueId(); + boost::replace_all(id, "/", "_"); + + String zookeeper_node = zookeeper_path + "/zero_copy_s3/shared/" + part.name + "/" + id + "/" + replica_name; + + LOG_TRACE(log, "Set zookeeper lock {}", zookeeper_node); + + /// In a rare case another replica can remove the path between createAncestors and createIfNotExists, + /// so we make up to 5 attempts + for (int attempts = 5; attempts > 0; --attempts) + { + try + { + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); + break; + } + catch (const zkutil::KeeperException & e) + { + if (e.code == Coordination::Error::ZNONODE) + continue; + throw; + } + } +} + + +bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const +{ + if (!part.volume) + return true; + DiskPtr disk = part.volume->getDisk(); + if (!disk) + return true; + if (disk->getType() != DB::DiskType::Type::S3) + return true; + + zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); + if (!zookeeper) + return true; + + String id = part.getUniqueId(); + boost::replace_all(id, "/", "_"); + + String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + part.name; + String zookeeper_part_uniq_node = zookeeper_part_node + "/" + id; + String zookeeper_node = zookeeper_part_uniq_node + "/" + replica_name; + + LOG_TRACE(log, "Remove zookeeper lock {}", zookeeper_node); + + zookeeper->tryRemove(zookeeper_node); + + Strings children; + zookeeper->tryGetChildren(zookeeper_part_uniq_node, children); + + if (!children.empty()) + { + LOG_TRACE(log, "Found zookeeper locks for {}", zookeeper_part_uniq_node); + return false; + } + + zookeeper->tryRemove(zookeeper_part_uniq_node); + + /// Even when we have a lock with the same part name but a different uniq id, we can remove the files on S3 + children.clear(); + zookeeper->tryGetChildren(zookeeper_part_node, children); + if (children.empty()) + /// 
Clean up after the last uniq node is removed + zookeeper->tryRemove(zookeeper_part_node); + + return true; +} + + +bool StorageReplicatedMergeTree::tryToFetchIfShared( + const IMergeTreeDataPart & part, + const DiskPtr & disk, + const String & path) +{ + const auto data_settings = getSettings(); + if (!data_settings->allow_s3_zero_copy_replication) + return false; + + if (disk->getType() != DB::DiskType::Type::S3) + return false; + + String replica = getSharedDataReplica(part); + + /// We can't fetch the part when no replica has it on S3 + if (replica.empty()) + return false; + + return executeFetchShared(replica, part.name, disk, path); +} + + +String StorageReplicatedMergeTree::getSharedDataReplica( + const IMergeTreeDataPart & part) const +{ + String best_replica; + + zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); + if (!zookeeper) + return best_replica; + + String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + part.name; + + Strings ids; + zookeeper->tryGetChildren(zookeeper_part_node, ids); + + Strings replicas; + for (const auto & id : ids) + { + String zookeeper_part_uniq_node = zookeeper_part_node + "/" + id; + Strings id_replicas; + zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); + LOG_TRACE(log, "Found zookeeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); + replicas.insert(replicas.end(), id_replicas.begin(), id_replicas.end()); + } + + LOG_TRACE(log, "Found zookeeper replicas for part {}: {}", part.name, replicas.size()); + + Strings active_replicas; + + /// TODO: Move best replica choice into a common method (here is the same code as in StorageReplicatedMergeTree::fetchPartition) + + /// Leave only active replicas. + active_replicas.reserve(replicas.size()); + + for (const String & replica : replicas) + if ((replica != replica_name) && (zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/is_active"))) + active_replicas.push_back(replica); + + LOG_TRACE(log, "Found zookeeper active replicas for part {}: {}", part.name, active_replicas.size()); + + if (active_replicas.empty()) + return best_replica; + + /** You must select the best (most relevant) replica. + * This is a replica with the maximum `log_pointer`, then with the minimum `queue` size. + * NOTE This is not exactly the best criteria. It does not make sense to download old partitions, + * and it would be nice to be able to choose the replica closest by network. + * NOTE Of course, there are data races here. You can solve it by retrying. + */ + Int64 max_log_pointer = -1; + UInt64 min_queue_size = std::numeric_limits::max(); + + for (const String & replica : active_replicas) + { + String current_replica_path = zookeeper_path + "/replicas/" + replica; + + String log_pointer_str = zookeeper->get(current_replica_path + "/log_pointer"); + Int64 log_pointer = log_pointer_str.empty() ? 
0 : parse(log_pointer_str); + + Coordination::Stat stat; + zookeeper->get(current_replica_path + "/queue", &stat); + size_t queue_size = stat.numChildren; + + if (log_pointer > max_log_pointer + || (log_pointer == max_log_pointer && queue_size < min_queue_size)) + { + max_log_pointer = log_pointer; + min_queue_size = queue_size; + best_replica = replica; + } + } + + return best_replica; +} + } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index a1a70ada9b2..2d1b50ede4c 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -212,6 +212,23 @@ public: /// is not overloaded bool canExecuteFetch(const ReplicatedMergeTreeLogEntry & entry, String & disable_reason) const; + /// Fetch part only when it is stored on shared storage like S3 + bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path); + + /// Lock part in zookeeper to allow sharing common S3 data between several nodes + void lockSharedData(const IMergeTreeDataPart & part) const override; + + /// Unlock common S3 data part in zookeeper + /// Returns true if the data was unlocked + /// Returns false if the data is still used by another node + bool unlockSharedData(const IMergeTreeDataPart & part) const override; + + /// Fetch part only if some replica has it on shared storage like S3 + bool tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override; + + /// Get best replica having this partition on S3 + String getSharedDataReplica(const IMergeTreeDataPart & part) const; + private: /// Get a sequential consistent view of current parts. ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock getMaxAddedBlocks() const; @@ -289,6 +306,9 @@ private: /// Event that is signalled (and is reset) by the restarting_thread when the ZooKeeper session expires. Poco::Event partial_shutdown_event {false}; /// Poco::Event::EVENT_MANUALRESET + /// Limiting parallel fetches per node + static std::atomic_uint total_fetches; + /// Limiting parallel fetches per one table std::atomic_uint current_table_fetches {0}; @@ -370,8 +390,7 @@ private: String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const; /// Accepts a PreComitted part, atomically checks its checksums with ones on other replicas and commit the part - DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, - const DataPartPtr & part); + DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part); bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override; @@ -515,6 +534,17 @@ private: size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_ = nullptr); + /** Download the specified part from the specified replica. + * Used to replace a local part with the same S3-shared part in hybrid storage. + * Returns false if the part is already being fetched right now. 
+ */ + bool fetchExistsPart( + const String & part_name, + const StorageMetadataPtr & metadata_snapshot, + const String & replica_path, + DiskPtr replaced_disk, + String replaced_part_path); + /// Required only to avoid races between executeLogEntry and fetchPartition std::unordered_set currently_fetching_parts; std::mutex currently_fetching_parts_mutex; diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 132ed234323..fffe909715f 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -359,6 +359,7 @@ protected: { auto & create = ast->as(); create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; } if (columns_mask[src_index++]) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 6d7ca807992..fce8fe65f30 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -138,7 +138,7 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std os.environ["CLICKHOUSE_DATABASE"] = database # This is for .sh tests - os.environ.setdefault("CLICKHOUSE_LOG_COMMENT", case_file) + os.environ["CLICKHOUSE_LOG_COMMENT"] = case_file params = { 'client': args.client + ' --database=' + database, @@ -248,11 +248,11 @@ def get_server_pid(server_tcp_port): output = None for cmd in commands: try: - output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) + output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT, universal_newlines=True) if output: return int(output) except Exception as e: - print("Cannot get server pid with {}, got {}: {}", cmd, output, e) + print("Cannot get server pid with {}, got {}: {}".format(cmd, output, e)) return None # most likely server dead diff --git a/tests/config/config.d/clusters.xml b/tests/config/config.d/clusters.xml index c0babf0ff89..b783372f83b 100644 --- a/tests/config/config.d/clusters.xml +++ b/tests/config/config.d/clusters.xml @@ -17,4 +17,4 @@ - \ No newline at end of file + diff --git a/tests/config/users.d/database_replicated.xml b/tests/config/users.d/database_replicated.xml index 23801d00154..903b8a64e22 100644 --- a/tests/config/users.d/database_replicated.xml +++ b/tests/config/users.d/database_replicated.xml @@ -2,9 +2,11 @@ 1 - 0 + none 30 30 + 1 + 2 diff --git a/tests/integration/test_replace_partition/test.py b/tests/integration/test_replace_partition/test.py index 06e7f4be82b..d30a038825f 100644 --- a/tests/integration/test_replace_partition/test.py +++ b/tests/integration/test_replace_partition/test.py @@ -1,3 +1,8 @@ +# pylint: disable=line-too-long +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name: + +import time import pytest from helpers.cluster import ClickHouseCluster @@ -13,13 +18,13 @@ def _fill_nodes(nodes, shard): node.query( ''' CREATE DATABASE test; - + CREATE TABLE real_table(date Date, id UInt32, dummy UInt32) ENGINE = MergeTree(date, id, 8192); - + CREATE TABLE other_table(date Date, id UInt32, dummy UInt32) ENGINE = MergeTree(date, id, 8192); - + CREATE TABLE test_table(date Date, id UInt32, dummy UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}', date, id, 8192); '''.format(shard=shard, replica=node.name)) @@ -97,12 +102,13 @@ def test_drop_failover(drop_failover): # Drop partition on source node node3.query("ALTER TABLE test_table DROP PARTITION 201706") - # connection restored + # Wait few seconds for connection to zookeeper to be restored + time.sleep(5) - 
node4.query_with_retry("select last_exception from system.replication_queue where type = 'REPLACE_RANGE'", - check_callback=lambda x: 'Not found part' not in x, sleep_time=1) - assert 'Not found part' not in node4.query( - "select last_exception from system.replication_queue where type = 'REPLACE_RANGE'") + msg = node4.query_with_retry( + "select last_exception from system.replication_queue where type = 'REPLACE_RANGE'", + check_callback=lambda x: 'Not found part' not in x, sleep_time=1) + assert 'Not found part' not in msg assert_eq_with_retry(node4, "SELECT id FROM test_table order by id", '') @@ -151,8 +157,11 @@ def test_replace_after_replace_failover(replace_after_replace_failover): assert_eq_with_retry(node5, "SELECT id FROM test_table order by id", '333') - node6.query_with_retry("select last_exception from system.replication_queue where type = 'REPLACE_RANGE'", - check_callback=lambda x: 'Not found part' not in x, sleep_time=1) - assert 'Not found part' not in node6.query( - "select last_exception from system.replication_queue where type = 'REPLACE_RANGE'") + # Wait few seconds for connection to zookeeper to be restored + time.sleep(5) + + msg = node6.query_with_retry( + "select last_exception from system.replication_queue where type = 'REPLACE_RANGE'", + check_callback=lambda x: 'Not found part' not in x, sleep_time=1) + assert 'Not found part' not in msg assert_eq_with_retry(node6, "SELECT id FROM test_table order by id", '333') diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml index ebceee3aa5c..d751454437c 100644 --- a/tests/integration/test_replicated_database/configs/config.xml +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -1,34 +1,3 @@ 10 - - - - - true - - main_node - 9000 - - - dummy_node - 9000 - - - competing_node - 9000 - - - - true - - snapshotting_node - 9000 - - - snapshot_recovering_node - 9000 - - - - diff --git a/tests/integration/test_replicated_database/configs/settings.xml b/tests/integration/test_replicated_database/configs/settings.xml index e0f7e8691e6..7f45502e20d 100644 --- a/tests/integration/test_replicated_database/configs/settings.xml +++ b/tests/integration/test_replicated_database/configs/settings.xml @@ -2,6 +2,7 @@ 1 + 1 diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 99e7d6077f8..f02457b144a 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -99,16 +99,20 @@ def test_alters_from_different_replicas(started_cluster): "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") + main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(testdb, testdb, concurrent_test, CounterID)") dummy_node.stop_clickhouse(kill=True) - settings = {"distributed_ddl_task_timeout": 10} + settings = {"distributed_ddl_task_timeout": 5} assert "There are 1 unfinished hosts (0 of them are currently active)" in \ competing_node.query_and_get_error("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;", settings=settings) + settings = {"distributed_ddl_task_timeout": 5, 
"distributed_ddl_output_mode": "null_status_on_timeout"} + assert "shard1|replica2\t\\N\t\\N" in \ + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;", settings=settings) + settings = {"distributed_ddl_task_timeout": 5, "distributed_ddl_output_mode": "never_throw"} + assert "shard1|replica2\t\\N\t\\N" in \ + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;", settings=settings) dummy_node.start_clickhouse() - main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") @@ -198,8 +202,14 @@ def test_recover_staled_replica(started_cluster): dummy_node.query("CREATE TABLE recover.rmt2 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) main_node.query("CREATE TABLE recover.rmt3 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) dummy_node.query("CREATE TABLE recover.rmt5 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) - main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") - dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + main_node.query("CREATE MATERIALIZED VIEW recover.mv1 (n int) ENGINE=ReplicatedMergeTree order by n AS SELECT n FROM recover.rmt1", settings=settings) + dummy_node.query("CREATE MATERIALIZED VIEW recover.mv2 (n int) ENGINE=ReplicatedMergeTree order by n AS SELECT n FROM recover.rmt2", settings=settings) + main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) @@ -217,35 +227,44 @@ def test_recover_staled_replica(started_cluster): main_node.query("RENAME TABLE recover.rmt3 TO recover.rmt4", settings=settings) main_node.query("DROP TABLE recover.rmt5", settings=settings) main_node.query("DROP DICTIONARY recover.d2", settings=settings) - main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) + main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n " + "SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' 
TABLE 'rmt1' PASSWORD '' DB 'recover')) " + "LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) + + inner_table = ".inner_id." + dummy_node.query("SELECT uuid FROM system.tables WHERE database='recover' AND name='mv1'").strip() + main_node.query("ALTER TABLE recover.`{}` MODIFY COLUMN n int DEFAULT 42".format(inner_table), settings=settings) + main_node.query("ALTER TABLE recover.mv1 MODIFY QUERY SELECT m FROM recover.rmt1".format(inner_table), settings=settings) + main_node.query("RENAME TABLE recover.mv2 TO recover.mv3".format(inner_table), settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - main_node.query("DROP TABLE recover.tmp", settings=settings) - main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" - query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' ORDER BY name" + assert main_node.query("SELECT name FROM system.tables WHERE database='recover' AND name NOT LIKE '.inner_id.%' ORDER BY name") == \ + "d1\nd2\nm1\nmt1\nmt2\nmv1\nmv3\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" + query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' AND name NOT LIKE '.inner_id.%' " \ + "ORDER BY name SETTINGS show_table_uuid_in_table_create_query_if_not_nil=1" expected = main_node.query(query) assert_eq_with_retry(dummy_node, query, expected) + assert main_node.query("SELECT count() FROM system.tables WHERE database='recover' AND name LIKE '.inner_id.%'") == "2\n" + assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover' AND name LIKE '.inner_id.%'") == "2\n" - for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2']: + for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mv1', 'mv3']: assert main_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" - for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2']: + for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2', 'mv1', 'mv3']: assert dummy_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" for table in ['m1', 'mt1']: assert dummy_node.query("SELECT count() FROM recover.{}".format(table)) == "0\n" assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "2\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_26_%'").strip() + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_29_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_26_%'").strip() + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_29_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - expected = "Cleaned 4 outdated objects: dropped 1 dictionaries and 1 tables, moved 2 tables" + expected = "Cleaned 6 outdated objects: dropped 1 dictionaries and 3 tables, moved 2 tables" assert_logs_contain(dummy_node, 
expected) dummy_node.query("DROP TABLE recover.tmp") diff --git a/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml index 20b750ffff3..1f75a4efeae 100644 --- a/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml +++ b/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml @@ -21,6 +21,7 @@ 0 + 0 diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/__init__.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..d8c7f49fc49 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml @@ -0,0 +1,50 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + +
+ s3 +
+
+
+
+
+ + + 0 + 1 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + node3 + 9000 + + + + + + + 0 + + +
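For orientation, a minimal SQL sketch (not part of this patch) of how the S3-backed policy declared in the config above can be checked at runtime. It assumes the table name s3_test that the test below creates with SETTINGS storage_policy='s3', and uses only the standard system.parts and system.storage_policies tables; the table and policy names come from this test's setup, not from the patch itself.

-- Every active part of the test table should live on the S3 disk declared above.
SELECT name, disk_name, active
FROM system.parts
WHERE table = 's3_test' AND active;

-- The policy referenced by storage_policy='s3' and the disks it maps to.
SELECT policy_name, volume_name, disks
FROM system.storage_policies
WHERE policy_name = 's3';

The test below checks the same setup indirectly, by counting the objects that appear under the 'data/' prefix in MinIO after inserts on each replica.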
diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py new file mode 100644 index 00000000000..793abc53566 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -0,0 +1,105 @@ +import logging +import random +import string + +import pytest +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + + cluster.add_instance("node1", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '1'}, + with_minio=True, with_zookeeper=True) + cluster.add_instance("node2", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '2'}, + with_zookeeper=True) + cluster.add_instance("node3", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '3'}, + with_zookeeper=True) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +FILES_OVERHEAD = 1 +FILES_OVERHEAD_PER_COLUMN = 2 # Data and mark files +FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 +FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 + + +def random_string(length): + letters = string.ascii_letters + return ''.join(random.choice(letters) for i in range(length)) + + +def generate_values(date_str, count, sign=1): + data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] + data.sort(key=lambda tup: tup[1]) + return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) + + +def create_table(cluster, additional_settings=None): + create_table_statement = """ + CREATE TABLE s3_test ON CLUSTER cluster( + dt Date, + id Int64, + data String, + INDEX min_max (id) TYPE minmax GRANULARITY 3 + ) ENGINE=ReplicatedMergeTree() + PARTITION BY dt + ORDER BY (dt, id) + SETTINGS storage_policy='s3' + """ + if additional_settings: + create_table_statement += "," + create_table_statement += additional_settings + + list(cluster.instances.values())[0].query(create_table_statement) + + +@pytest.fixture(autouse=True) +def drop_table(cluster): + yield + for node in list(cluster.instances.values()): + node.query("DROP TABLE IF EXISTS s3_test") + + minio = cluster.minio_client + # Remove extra objects to prevent tests cascade failing + for obj in list(minio.list_objects(cluster.minio_bucket, 'data/')): + minio.remove_object(cluster.minio_bucket, obj.object_name) + +@pytest.mark.parametrize( + "min_rows_for_wide_part,files_per_part", + [ + (0, FILES_OVERHEAD_PER_PART_WIDE), + (8192, FILES_OVERHEAD_PER_PART_COMPACT) + ] +) +def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_part): + create_table(cluster, additional_settings="min_rows_for_wide_part={}".format(min_rows_for_wide_part)) + + all_values = "" + for node_idx in range(1, 4): + node = cluster.instances["node" + str(node_idx)] + values = generate_values("2020-01-0" + str(node_idx), 4096) + node.query("INSERT INTO s3_test VALUES {}".format(values), settings={"insert_quorum": 3}) + if node_idx != 1: + all_values += "," + all_values += values + + for node_idx in range(1, 4): + node = cluster.instances["node" + str(node_idx)] + assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values", + settings={"select_sequential_consistency": 1}) == all_values + + minio = cluster.minio_client + assert 
len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == (3 * FILES_OVERHEAD) + (files_per_part * 3) diff --git a/tests/integration/test_s3_zero_copy_replication/__init__.py b/tests/integration/test_s3_zero_copy_replication/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml new file mode 100644 index 00000000000..7d8492ed68c --- /dev/null +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -0,0 +1,50 @@ + + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + +
+ s31 +
+
+
+
+
+ + + 0 + 1 + 1 + + + + + + + node1 + 9000 + + + + + node2 + 9000 + + + + + + + test_cluster + + +
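As a complementary sketch (again not part of the patch): besides counting large MinIO objects as the test below does, the health of the zero-copy replicas can be inspected from SQL on either node. The table name s3_test is assumed from the test_s3_zero_copy_replication test that follows; system.replicas is a standard ClickHouse system table.

-- Both replicas should be active with empty replication queues once the
-- inserts in the test below have been replicated.
SELECT database, table, replica_name, is_leader, queue_size, absolute_delay
FROM system.replicas
WHERE table = 's3_test';

With zero-copy replication both replicas reference the same S3 objects, which is why the test asserts on the number of objects in the bucket rather than on per-replica disk usage.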
diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py new file mode 100644 index 00000000000..6a7336b9090 --- /dev/null +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -0,0 +1,90 @@ +import logging +import time + +import pytest +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance("node1", main_configs=["configs/config.d/s3.xml"], macros={'replica': '1'}, + with_minio=True, + with_zookeeper=True) + cluster.add_instance("node2", main_configs=["configs/config.d/s3.xml"], macros={'replica': '2'}, + with_minio=True, + with_zookeeper=True) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def get_large_objects_count(cluster, size=100): + minio = cluster.minio_client + counter = 0 + for obj in minio.list_objects(cluster.minio_bucket, 'data/'): + if obj.size >= size: + counter = counter + 1 + return counter + + +@pytest.mark.parametrize( + "policy", ["s3"] +) +def test_s3_zero_copy_replication(cluster, policy): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query( + """ + CREATE TABLE s3_test ON CLUSTER test_cluster (id UInt32, value String) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/s3_test', '{}') + ORDER BY id + SETTINGS storage_policy='{}' + """ + .format('{replica}', policy) + ) + + node1.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')") + time.sleep(1) + assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" + assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" + + # Based on version 20.x - should be only one file with size 100+ (checksums.txt), used by both nodes + assert get_large_objects_count(cluster) == 1 + + node2.query("INSERT INTO s3_test VALUES (2,'data'),(3,'data')") + time.sleep(1) + assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" + assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" + + # Based on version 20.x - two parts + assert get_large_objects_count(cluster) == 2 + + node1.query("OPTIMIZE TABLE s3_test") + + time.sleep(1) + + # Based on version 20.x - after merge, two old parts and one merged + assert get_large_objects_count(cluster) == 3 + + # Based on version 20.x - after cleanup - only one merged part + countdown = 60 + while countdown > 0: + if get_large_objects_count(cluster) == 1: + break + time.sleep(1) + countdown -= 1 + assert get_large_objects_count(cluster) == 1 + + node1.query("DROP TABLE IF EXISTS s3_test NO DELAY") + node2.query("DROP TABLE IF EXISTS s3_test NO DELAY") + diff --git a/tests/queries/0_stateless/00966_live_view_watch_events_http.py b/tests/queries/0_stateless/00966_live_view_watch_events_http.py index 3d407ec5602..1c00a5d1236 100755 --- a/tests/queries/0_stateless/00966_live_view_watch_events_http.py +++ b/tests/queries/0_stateless/00966_live_view_watch_events_http.py @@ -28,13 +28,14 @@ with client(name='client1>', log=log) as client1: client1.expect(prompt) - with http_client({'method':'GET', 'url': '/?allow_experimental_live_view=1&query=WATCH%20test.lv%20EVENTS'}, 
name='client2>', log=log) as client2: - client2.expect('.*1\n') - client1.send('INSERT INTO test.mt VALUES (1),(2),(3)') + try: + with http_client({'method':'GET', 'url': '/?allow_experimental_live_view=1&query=WATCH%20test.lv%20EVENTS'}, name='client2>', log=log) as client2: + client2.expect('.*1\n') + client1.send('INSERT INTO test.mt VALUES (1),(2),(3)') + client1.expect(prompt) + client2.expect('.*2\n') + finally: + client1.send('DROP TABLE test.lv') + client1.expect(prompt) + client1.send('DROP TABLE test.mt') client1.expect(prompt) - client2.expect('.*2\n') - - client1.send('DROP TABLE test.lv') - client1.expect(prompt) - client1.send('DROP TABLE test.mt') - client1.expect(prompt) diff --git a/tests/queries/0_stateless/00967_live_view_watch_http.py b/tests/queries/0_stateless/00967_live_view_watch_http.py index d26bb5402e7..c41b9f0c861 100755 --- a/tests/queries/0_stateless/00967_live_view_watch_http.py +++ b/tests/queries/0_stateless/00967_live_view_watch_http.py @@ -27,14 +27,14 @@ with client(name='client1>', log=log) as client1: client1.send('CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt') client1.expect(prompt) - - with http_client({'method':'GET', 'url':'/?allow_experimental_live_view=1&query=WATCH%20test.lv'}, name='client2>', log=log) as client2: - client2.expect('.*0\t1\n') - client1.send('INSERT INTO test.mt VALUES (1),(2),(3)') + try: + with http_client({'method':'GET', 'url':'/?allow_experimental_live_view=1&query=WATCH%20test.lv'}, name='client2>', log=log) as client2: + client2.expect('.*0\t1\n') + client1.send('INSERT INTO test.mt VALUES (1),(2),(3)') + client1.expect(prompt) + client2.expect('.*6\t2\n') + finally: + client1.send('DROP TABLE test.lv') + client1.expect(prompt) + client1.send('DROP TABLE test.mt') client1.expect(prompt) - client2.expect('.*6\t2\n') - - client1.send('DROP TABLE test.lv') - client1.expect(prompt) - client1.send('DROP TABLE test.mt') - client1.expect(prompt) diff --git a/tests/queries/0_stateless/01018_ip_dictionary.reference b/tests/queries/0_stateless/01018_ip_dictionary_long.reference similarity index 100% rename from tests/queries/0_stateless/01018_ip_dictionary.reference rename to tests/queries/0_stateless/01018_ip_dictionary_long.reference diff --git a/tests/queries/0_stateless/01018_ip_dictionary.sql b/tests/queries/0_stateless/01018_ip_dictionary_long.sql similarity index 99% rename from tests/queries/0_stateless/01018_ip_dictionary.sql rename to tests/queries/0_stateless/01018_ip_dictionary_long.sql index 5df1afcd559..2abd51cc9fe 100644 --- a/tests/queries/0_stateless/01018_ip_dictionary.sql +++ b/tests/queries/0_stateless/01018_ip_dictionary_long.sql @@ -41,6 +41,9 @@ SOURCE(CLICKHOUSE(host 'localhost' port 9000 user 'default' db 'database_for_dic LAYOUT(IP_TRIE()) LIFETIME(MIN 10 MAX 100); +-- fuzzer +SELECT '127.0.0.0/24' = dictGetString('database_for_dict.dict_ipv4_trie', 'prefixprefixprefixprefix', tuple(IPv4StringToNum('127.0.0.0127.0.0.0'))); -- { serverError 36 } + SELECT 0 == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'asn', tuple(IPv4StringToNum('0.0.0.0'))); SELECT 1 == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'asn', tuple(IPv4StringToNum('128.0.0.0'))); SELECT 2 == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'asn', tuple(IPv4StringToNum('192.0.0.0'))); diff --git a/tests/queries/0_stateless/01035_lc_empty_part_bug.sh b/tests/queries/0_stateless/01035_lc_empty_part_bug.sh index b65cf87d1ca..185c4ef4a4e 100755 --- a/tests/queries/0_stateless/01035_lc_empty_part_bug.sh +++ 
b/tests/queries/0_stateless/01035_lc_empty_part_bug.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} --multiquery --query=" DROP TABLE IF EXISTS lc_empty_part_bug; - create table lc_empty_part_bug (id UInt64, s String) Engine=MergeTree ORDER BY id; + create table lc_empty_part_bug (id UInt64, s String) Engine=MergeTree ORDER BY id SETTINGS number_of_free_entries_in_pool_to_execute_mutation=0; insert into lc_empty_part_bug select number as id, toString(rand()) from numbers(100); alter table lc_empty_part_bug delete where id < 100; " --mutations_sync=1 diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference new file mode 100644 index 00000000000..0cc8c788fed --- /dev/null +++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference @@ -0,0 +1,25 @@ +none +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.throw already exists +Received exception from server: +Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=8) seconds. There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. +throw +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.throw already exists. 0 0 +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.throw already exists +localhost 9000 0 1 0 +Received exception from server: +Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=8) seconds. There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. +null_status_on_timeout +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.null_status already exists. 0 0 +Received exception from server: +Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57, e.displayText() = Error: Table default.null_status already exists +localhost 9000 0 1 0 +localhost 1 \N \N 1 0 +never_throw +localhost 9000 0 0 0 +localhost 9000 57 Code: 57, e.displayText() = Error: Table default.never_throw already exists. 0 0 +localhost 9000 0 1 0 +localhost 1 \N \N 1 0 diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh new file mode 100755 index 00000000000..66ceef21682 --- /dev/null +++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "drop table if exists throw;" +$CLICKHOUSE_CLIENT -q "drop table if exists null_status;" +$CLICKHOUSE_CLIENT -q "drop table if exists never_throw;" + +CLICKHOUSE_CLIENT_OPT=$(echo ${CLICKHOUSE_CLIENT_OPT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=fatal/g') + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=none" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +# Ok +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" +# Table exists +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. /exists/" +# Timeout +$CLIENT -q "drop table throw on cluster test_unavailable_shard;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/Watching task .* is executing longer/Watching task is executing longer/" | sed "s/background. /background./" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=throw" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. /exists/" +$CLIENT -q "drop table throw on cluster test_unavailable_shard;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/Watching task .* is executing longer/Watching task is executing longer/" | sed "s/background. /background./" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=null_status_on_timeout" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table null_status on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table null_status on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| grep -Fv "@ 0x" | sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" | sed "s/exists.. 
/exists/" +$CLIENT -q "drop table null_status on cluster test_unavailable_shard;" + +CLIENT="$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT --distributed_ddl_task_timeout=8 --distributed_ddl_output_mode=never_throw" +$CLIENT -q "select value from system.settings where name='distributed_ddl_output_mode';" +$CLIENT -q "create table never_throw on cluster test_shard_localhost (n int) engine=Memory;" +$CLIENT -q "create table never_throw on cluster test_shard_localhost (n int) engine=Memory;" 2>&1| sed "s/DB::Exception/Error/g" | sed "s/ (version.*)//" +$CLIENT -q "drop table never_throw on cluster test_unavailable_shard;" diff --git a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql index 8c304818602..50b34c4b18f 100644 --- a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql +++ b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql @@ -42,8 +42,7 @@ LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse SELECT 'TEST_SMALL'; SELECT 'VALUE FROM RAM BUFFER'; --- NUMBER_OF_ARGUMENTS_DOESNT_MATCH -SELECT dictHas('01280_db.ssd_dict', 'a', tuple('1')); -- { serverError 42 } +SELECT dictHas('01280_db.ssd_dict', 'a', tuple('1')); -- { serverError 43 } SELECT dictGetUInt64('01280_db.ssd_dict', 'a', tuple('1', toInt32(3))); SELECT dictGetInt32('01280_db.ssd_dict', 'b', tuple('1', toInt32(3))); diff --git a/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables.reference b/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long.reference similarity index 100% rename from tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables.reference rename to tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long.reference diff --git a/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables.sh b/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long.sh similarity index 94% rename from tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables.sh rename to tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long.sh index d8f72c7837d..f5a4a1adac0 100755 --- a/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables.sh +++ b/tests/queries/0_stateless/01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long.sh @@ -70,8 +70,10 @@ function recreate_lazy_func4() function test_func() { while true; do - $CLICKHOUSE_CLIENT -q "SYSTEM STOP TTL MERGES"; + for table in log tlog slog tlog2; do + $CLICKHOUSE_CLIENT -q "SYSTEM STOP TTL MERGES $CURR_DATABASE.$table" >& /dev/null done + done } diff --git a/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.reference b/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.reference index 654db9dbc86..f57d5df6efd 100644 --- a/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.reference +++ b/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.reference @@ -10,3 +10,5 @@ wide fsync_after_insert,fsync_part_directory 1 memory in_memory_parts_insert_sync 1 +wide fsync_part_directory,vertical +1 diff --git a/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.sql b/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.sql index 21ebb607693..644cf063a33 100644 --- 
a/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.sql +++ b/tests/queries/0_stateless/01643_merge_tree_fsync_smoke.sql @@ -4,34 +4,47 @@ select 'default'; create table data_01643 (key Int) engine=MergeTree() order by key; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; drop table data_01643; select 'compact fsync_after_insert'; create table data_01643 (key Int) engine=MergeTree() order by key settings min_rows_for_wide_part=2, fsync_after_insert=1; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; drop table data_01643; select 'compact fsync_after_insert,fsync_part_directory'; create table data_01643 (key Int) engine=MergeTree() order by key settings min_rows_for_wide_part=2, fsync_after_insert=1, fsync_part_directory=1; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; drop table data_01643; select 'wide fsync_after_insert'; create table data_01643 (key Int) engine=MergeTree() order by key settings min_bytes_for_wide_part=0, fsync_after_insert=1; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; drop table data_01643; select 'wide fsync_after_insert,fsync_part_directory'; create table data_01643 (key Int) engine=MergeTree() order by key settings min_bytes_for_wide_part=0, fsync_after_insert=1, fsync_part_directory=1; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; drop table data_01643; select 'memory in_memory_parts_insert_sync'; create table data_01643 (key Int) engine=MergeTree() order by key settings min_rows_for_compact_part=2, in_memory_parts_insert_sync=1, fsync_after_insert=1, fsync_part_directory=1; insert into data_01643 values (1); select * from data_01643; +optimize table data_01643 final; +drop table data_01643; + +select 'wide fsync_part_directory,vertical'; +create table data_01643 (key Int) engine=MergeTree() order by key settings min_bytes_for_wide_part=0, fsync_part_directory=1, enable_vertical_merge_algorithm=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1; +insert into data_01643 values (1); +select * from data_01643; +optimize table data_01643 final; drop table data_01643; diff --git a/tests/queries/0_stateless/01666_blns.sql b/tests/queries/0_stateless/01666_blns.sql index 787f0369244..be9632092bc 100644 --- a/tests/queries/0_stateless/01666_blns.sql +++ b/tests/queries/0_stateless/01666_blns.sql @@ -554,9 +554,9 @@ SELECT count() FROM test; DROP TABLE IF EXISTS test_r1; DROP TABLE IF EXISTS test_r2; -CREATE TABLE test_r1 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test', 'r1') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; +CREATE TABLE test_r1 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test_01666', 'r1') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; INSERT INTO test_r1 SELECT * FROM test; -CREATE TABLE test_r2 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test', 'r2') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; +CREATE TABLE test_r2 AS test ENGINE = ReplicatedMergeTree('/clickhouse/test_01666', 'r2') ORDER BY "\\" SETTINGS min_bytes_for_wide_part = '100G'; SYSTEM SYNC REPLICA test_r2; diff --git a/tests/queries/0_stateless/01676_range_hashed_dictionary.reference b/tests/queries/0_stateless/01676_range_hashed_dictionary.reference new file mode 100644 index 00000000000..23a5180d99c --- /dev/null +++ 
b/tests/queries/0_stateless/01676_range_hashed_dictionary.reference @@ -0,0 +1,58 @@ +Dictionary not nullable +dictGet +0.33 +0.42 +0.46 +0.2 +0.4 +dictHas +1 +1 +1 +0 +select columns from dictionary +allColumns +1 2019-05-05 2019-05-20 0.33 +1 2019-05-21 2019-05-30 0.42 +2 2019-05-21 2019-05-30 0.46 +noColumns +1 +1 +1 +onlySpecificColumns +1 2019-05-05 0.33 +1 2019-05-21 0.42 +2 2019-05-21 0.46 +onlySpecificColumn +0.33 +0.42 +0.46 +Dictionary nullable +dictGet +0.33 +0.42 +\N +0.2 +0.4 +dictHas +1 +1 +1 +0 +select columns from dictionary +allColumns +1 2019-05-05 2019-05-20 0.33 +1 2019-05-21 2019-05-30 0.42 +2 2019-05-21 2019-05-30 \N +noColumns +1 +1 +1 +onlySpecificColumns +1 2019-05-05 0.33 +1 2019-05-21 0.42 +2 2019-05-21 \N +onlySpecificColumn +0.33 +0.42 +\N diff --git a/tests/queries/0_stateless/01676_range_hashed_dictionary.sql b/tests/queries/0_stateless/01676_range_hashed_dictionary.sql new file mode 100644 index 00000000000..455e850b239 --- /dev/null +++ b/tests/queries/0_stateless/01676_range_hashed_dictionary.sql @@ -0,0 +1,110 @@ +DROP DATABASE IF EXISTS database_for_range_dict; + +CREATE DATABASE database_for_range_dict; + +CREATE TABLE database_for_range_dict.date_table +( + CountryID UInt64, + StartDate Date, + EndDate Date, + Tax Float64 +) +ENGINE = MergeTree() +ORDER BY CountryID; + +INSERT INTO database_for_range_dict.date_table VALUES(1, toDate('2019-05-05'), toDate('2019-05-20'), 0.33); +INSERT INTO database_for_range_dict.date_table VALUES(1, toDate('2019-05-21'), toDate('2019-05-30'), 0.42); +INSERT INTO database_for_range_dict.date_table VALUES(2, toDate('2019-05-21'), toDate('2019-05-30'), 0.46); + +CREATE DICTIONARY database_for_range_dict.range_dictionary +( + CountryID UInt64, + StartDate Date, + EndDate Date, + Tax Float64 DEFAULT 0.2 +) +PRIMARY KEY CountryID +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'date_table' DB 'database_for_range_dict')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(RANGE_HASHED()) +RANGE(MIN StartDate MAX EndDate); + +SELECT 'Dictionary not nullable'; +SELECT 'dictGet'; +SELECT dictGet('database_for_range_dict.range_dictionary', 'Tax', toUInt64(1), toDate('2019-05-15')); +SELECT dictGet('database_for_range_dict.range_dictionary', 'Tax', toUInt64(1), toDate('2019-05-29')); +SELECT dictGet('database_for_range_dict.range_dictionary', 'Tax', toUInt64(2), toDate('2019-05-29')); +SELECT dictGet('database_for_range_dict.range_dictionary', 'Tax', toUInt64(2), toDate('2019-05-31')); +SELECT dictGetOrDefault('database_for_range_dict.range_dictionary', 'Tax', toUInt64(2), toDate('2019-05-31'), 0.4); +SELECT 'dictHas'; +SELECT dictHas('database_for_range_dict.range_dictionary', toUInt64(1), toDate('2019-05-15')); +SELECT dictHas('database_for_range_dict.range_dictionary', toUInt64(1), toDate('2019-05-29')); +SELECT dictHas('database_for_range_dict.range_dictionary', toUInt64(2), toDate('2019-05-29')); +SELECT dictHas('database_for_range_dict.range_dictionary', toUInt64(2), toDate('2019-05-31')); +SELECT 'select columns from dictionary'; +SELECT 'allColumns'; +SELECT * FROM database_for_range_dict.range_dictionary; +SELECT 'noColumns'; +SELECT 1 FROM database_for_range_dict.range_dictionary; +SELECT 'onlySpecificColumns'; +SELECT CountryID, StartDate, Tax FROM database_for_range_dict.range_dictionary; +SELECT 'onlySpecificColumn'; +SELECT Tax FROM database_for_range_dict.range_dictionary; + +DROP TABLE database_for_range_dict.date_table; +DROP DICTIONARY database_for_range_dict.range_dictionary; + +CREATE TABLE 
database_for_range_dict.date_table +( + CountryID UInt64, + StartDate Date, + EndDate Date, + Tax Nullable(Float64) +) +ENGINE = MergeTree() +ORDER BY CountryID; + +INSERT INTO database_for_range_dict.date_table VALUES(1, toDate('2019-05-05'), toDate('2019-05-20'), 0.33); +INSERT INTO database_for_range_dict.date_table VALUES(1, toDate('2019-05-21'), toDate('2019-05-30'), 0.42); +INSERT INTO database_for_range_dict.date_table VALUES(2, toDate('2019-05-21'), toDate('2019-05-30'), NULL); + +CREATE DICTIONARY database_for_range_dict.range_dictionary_nullable +( + CountryID UInt64, + StartDate Date, + EndDate Date, + Tax Nullable(Float64) DEFAULT 0.2 +) +PRIMARY KEY CountryID +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'date_table' DB 'database_for_range_dict')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(RANGE_HASHED()) +RANGE(MIN StartDate MAX EndDate); + +SELECT 'Dictionary nullable'; +SELECT 'dictGet'; +SELECT dictGet('database_for_range_dict.range_dictionary_nullable', 'Tax', toUInt64(1), toDate('2019-05-15')); +SELECT dictGet('database_for_range_dict.range_dictionary_nullable', 'Tax', toUInt64(1), toDate('2019-05-29')); +SELECT dictGet('database_for_range_dict.range_dictionary_nullable', 'Tax', toUInt64(2), toDate('2019-05-29')); +SELECT dictGet('database_for_range_dict.range_dictionary_nullable', 'Tax', toUInt64(2), toDate('2019-05-31')); +SELECT dictGetOrDefault('database_for_range_dict.range_dictionary_nullable', 'Tax', toUInt64(2), toDate('2019-05-31'), 0.4); +SELECT 'dictHas'; +SELECT dictHas('database_for_range_dict.range_dictionary_nullable', toUInt64(1), toDate('2019-05-15')); +SELECT dictHas('database_for_range_dict.range_dictionary_nullable', toUInt64(1), toDate('2019-05-29')); +SELECT dictHas('database_for_range_dict.range_dictionary_nullable', toUInt64(2), toDate('2019-05-29')); +SELECT dictHas('database_for_range_dict.range_dictionary_nullable', toUInt64(2), toDate('2019-05-31')); +SELECT 'select columns from dictionary'; +SELECT 'allColumns'; +SELECT * FROM database_for_range_dict.range_dictionary_nullable; +SELECT 'noColumns'; +SELECT 1 FROM database_for_range_dict.range_dictionary_nullable; +SELECT 'onlySpecificColumns'; +SELECT CountryID, StartDate, Tax FROM database_for_range_dict.range_dictionary_nullable; +SELECT 'onlySpecificColumn'; +SELECT Tax FROM database_for_range_dict.range_dictionary_nullable; + +DROP TABLE database_for_range_dict.date_table; +DROP DICTIONARY database_for_range_dict.range_dictionary_nullable; + +DROP DATABASE database_for_range_dict; + diff --git a/tests/queries/0_stateless/01702_system_query_log.reference b/tests/queries/0_stateless/01702_system_query_log.reference new file mode 100644 index 00000000000..6d8908249bf --- /dev/null +++ b/tests/queries/0_stateless/01702_system_query_log.reference @@ -0,0 +1,92 @@ +DROP queries and also a cleanup before the test +CREATE queries +SET queries +ALTER TABLE queries +SYSTEM queries +SHOW queries +GRANT queries +REVOKE queries +Misc queries +ACTUAL LOG CONTENT: +Select SELECT \'DROP queries and also a cleanup before the test\'; +Drop DROP DATABASE IF EXISTS sqllt SYNC; + DROP USER IF EXISTS sqllt_user; + DROP ROLE IF EXISTS sqllt_role; + DROP POLICY IF EXISTS sqllt_policy ON sqllt.table, sqllt.view, sqllt.dictionary; + DROP ROW POLICY IF EXISTS sqllt_row_policy ON sqllt.table, sqllt.view, sqllt.dictionary; + DROP QUOTA IF EXISTS sqllt_quota; + DROP SETTINGS PROFILE IF EXISTS sqllt_settings_profile; +Select SELECT \'CREATE queries\'; +Create CREATE DATABASE sqllt; +Create CREATE TABLE 
sqllt.table\n(\n i UInt8, s String\n)\nENGINE = MergeTree PARTITION BY tuple() ORDER BY tuple(); +Create CREATE VIEW sqllt.view AS SELECT i, s FROM sqllt.table; +Create CREATE DICTIONARY sqllt.dictionary (key UInt64, value UInt64) PRIMARY KEY key SOURCE(CLICKHOUSE(DB \'sqllt\' TABLE \'table\' HOST \'localhost\' PORT 9001)) LIFETIME(0) LAYOUT(FLAT()); + CREATE USER sqllt_user IDENTIFIED WITH PLAINTEXT_PASSWORD BY \'password\'; + CREATE ROLE sqllt_role; + CREATE POLICY sqllt_policy ON sqllt.table, sqllt.view, sqllt.dictionary AS PERMISSIVE TO ALL; + CREATE POLICY sqllt_row_policy ON sqllt.table, sqllt.view, sqllt.dictionary AS PERMISSIVE TO ALL; + CREATE QUOTA sqllt_quota KEYED BY user_name TO sqllt_role; + CREATE SETTINGS PROFILE sqllt_settings_profile SETTINGS interactive_delay = 200000; +Grant GRANT sqllt_role TO sqllt_user; +Select SELECT \'SET queries\'; + SET log_profile_events=false; + SET DEFAULT ROLE sqllt_role TO sqllt_user; +Select -- SET ROLE sqllt_role; -- tests are executed by user `default` which is defined in XML and is impossible to update.\n\nSELECT \'ALTER TABLE queries\'; +Alter ALTER TABLE sqllt.table ADD COLUMN new_col UInt32 DEFAULT 123456789; +Alter ALTER TABLE sqllt.table COMMENT COLUMN new_col \'dummy column with a comment\'; +Alter ALTER TABLE sqllt.table CLEAR COLUMN new_col; +Alter ALTER TABLE sqllt.table MODIFY COLUMN new_col DateTime DEFAULT \'2015-05-18 07:40:13\'; +Alter ALTER TABLE sqllt.table MODIFY COLUMN new_col REMOVE COMMENT; +Alter ALTER TABLE sqllt.table RENAME COLUMN new_col TO the_new_col; +Alter ALTER TABLE sqllt.table DROP COLUMN the_new_col; +Alter ALTER TABLE sqllt.table UPDATE i = i + 1 WHERE 1; +Alter ALTER TABLE sqllt.table DELETE WHERE i > 65535; +Select -- not done, seems to hard, so I\'ve skipped queries of ALTER-X, where X is:\n-- PARTITION\n-- ORDER BY\n-- SAMPLE BY\n-- INDEX\n-- CONSTRAINT\n-- TTL\n-- USER\n-- QUOTA\n-- ROLE\n-- ROW POLICY\n-- SETTINGS PROFILE\n\nSELECT \'SYSTEM queries\'; +System SYSTEM RELOAD EMBEDDED DICTIONARIES; +System SYSTEM RELOAD DICTIONARIES; +System SYSTEM DROP DNS CACHE; +System SYSTEM DROP MARK CACHE; +System SYSTEM DROP UNCOMPRESSED CACHE; +System SYSTEM FLUSH LOGS; +System SYSTEM RELOAD CONFIG; +System SYSTEM STOP MERGES; +System SYSTEM START MERGES; +System SYSTEM STOP TTL MERGES; +System SYSTEM START TTL MERGES; +System SYSTEM STOP MOVES; +System SYSTEM START MOVES; +System SYSTEM STOP FETCHES; +System SYSTEM START FETCHES; +System SYSTEM STOP REPLICATED SENDS; +System SYSTEM START REPLICATED SENDS; +Select -- SYSTEM RELOAD DICTIONARY sqllt.dictionary; -- temporary out of order: Code: 210, Connection refused (localhost:9001) (version 21.3.1.1)\n-- DROP REPLICA\n-- haha, no\n-- SYSTEM KILL;\n-- SYSTEM SHUTDOWN;\n\n-- Since we don\'t really care about the actual output, suppress it with `FORMAT Null`.\nSELECT \'SHOW queries\'; + SHOW CREATE TABLE sqllt.table FORMAT Null; + SHOW CREATE DICTIONARY sqllt.dictionary FORMAT Null; + SHOW DATABASES LIKE \'sqllt\' FORMAT Null; + SHOW TABLES FROM sqllt FORMAT Null; + SHOW DICTIONARIES FROM sqllt FORMAT Null; + SHOW GRANTS FORMAT Null; + SHOW GRANTS FOR sqllt_user FORMAT Null; + SHOW CREATE USER sqllt_user FORMAT Null; + SHOW CREATE ROLE sqllt_role FORMAT Null; + SHOW CREATE POLICY sqllt_policy FORMAT Null; + SHOW CREATE ROW POLICY sqllt_row_policy FORMAT Null; + SHOW CREATE QUOTA sqllt_quota FORMAT Null; + SHOW CREATE SETTINGS PROFILE sqllt_settings_profile FORMAT Null; +Select SELECT \'GRANT queries\'; +Grant GRANT SELECT ON sqllt.table TO sqllt_user; +Grant 
GRANT DROP ON sqllt.view TO sqllt_user; +Select SELECT \'REVOKE queries\'; +Revoke REVOKE SELECT ON sqllt.table FROM sqllt_user; +Revoke REVOKE DROP ON sqllt.view FROM sqllt_user; +Select SELECT \'Misc queries\'; + DESCRIBE TABLE sqllt.table FORMAT Null; + CHECK TABLE sqllt.table FORMAT Null; +Drop DETACH TABLE sqllt.table; +Create ATTACH TABLE sqllt.table; +Rename RENAME TABLE sqllt.table TO sqllt.table_new; +Rename RENAME TABLE sqllt.table_new TO sqllt.table; +Drop TRUNCATE TABLE sqllt.table; +Drop DROP TABLE sqllt.table SYNC; + SET log_comment=\'\'; +DROP queries and also a cleanup after the test diff --git a/tests/queries/0_stateless/01702_system_query_log.sql b/tests/queries/0_stateless/01702_system_query_log.sql new file mode 100644 index 00000000000..f31d8de3577 --- /dev/null +++ b/tests/queries/0_stateless/01702_system_query_log.sql @@ -0,0 +1,152 @@ +-- fire all kinds of queries and then check if those are present in the system.query_log +SET log_comment='system.query_log logging test'; + +SELECT 'DROP queries and also a cleanup before the test'; +DROP DATABASE IF EXISTS sqllt SYNC; +DROP USER IF EXISTS sqllt_user; +DROP ROLE IF EXISTS sqllt_role; +DROP POLICY IF EXISTS sqllt_policy ON sqllt.table, sqllt.view, sqllt.dictionary; +DROP ROW POLICY IF EXISTS sqllt_row_policy ON sqllt.table, sqllt.view, sqllt.dictionary; +DROP QUOTA IF EXISTS sqllt_quota; +DROP SETTINGS PROFILE IF EXISTS sqllt_settings_profile; + +SELECT 'CREATE queries'; +CREATE DATABASE sqllt; + +CREATE TABLE sqllt.table +( + i UInt8, s String +) +ENGINE = MergeTree PARTITION BY tuple() ORDER BY tuple(); + +CREATE VIEW sqllt.view AS SELECT i, s FROM sqllt.table; +CREATE DICTIONARY sqllt.dictionary (key UInt64, value UInt64) PRIMARY KEY key SOURCE(CLICKHOUSE(DB 'sqllt' TABLE 'table' HOST 'localhost' PORT 9001)) LIFETIME(0) LAYOUT(FLAT()); + +CREATE USER sqllt_user IDENTIFIED WITH PLAINTEXT_PASSWORD BY 'password'; +CREATE ROLE sqllt_role; + +CREATE POLICY sqllt_policy ON sqllt.table, sqllt.view, sqllt.dictionary AS PERMISSIVE TO ALL; +CREATE POLICY sqllt_row_policy ON sqllt.table, sqllt.view, sqllt.dictionary AS PERMISSIVE TO ALL; + +CREATE QUOTA sqllt_quota KEYED BY user_name TO sqllt_role; +CREATE SETTINGS PROFILE sqllt_settings_profile SETTINGS interactive_delay = 200000; + +GRANT sqllt_role TO sqllt_user; + + +SELECT 'SET queries'; +SET log_profile_events=false; +SET DEFAULT ROLE sqllt_role TO sqllt_user; +-- SET ROLE sqllt_role; -- tests are executed by user `default` which is defined in XML and is impossible to update. 
+ +SELECT 'ALTER TABLE queries'; +ALTER TABLE sqllt.table ADD COLUMN new_col UInt32 DEFAULT 123456789; +ALTER TABLE sqllt.table COMMENT COLUMN new_col 'dummy column with a comment'; +ALTER TABLE sqllt.table CLEAR COLUMN new_col; +ALTER TABLE sqllt.table MODIFY COLUMN new_col DateTime DEFAULT '2015-05-18 07:40:13'; +ALTER TABLE sqllt.table MODIFY COLUMN new_col REMOVE COMMENT; +ALTER TABLE sqllt.table RENAME COLUMN new_col TO the_new_col; +ALTER TABLE sqllt.table DROP COLUMN the_new_col; +ALTER TABLE sqllt.table UPDATE i = i + 1 WHERE 1; +ALTER TABLE sqllt.table DELETE WHERE i > 65535; + +-- not done, seems to hard, so I've skipped queries of ALTER-X, where X is: +-- PARTITION +-- ORDER BY +-- SAMPLE BY +-- INDEX +-- CONSTRAINT +-- TTL +-- USER +-- QUOTA +-- ROLE +-- ROW POLICY +-- SETTINGS PROFILE + +SELECT 'SYSTEM queries'; +SYSTEM RELOAD EMBEDDED DICTIONARIES; +SYSTEM RELOAD DICTIONARIES; +SYSTEM DROP DNS CACHE; +SYSTEM DROP MARK CACHE; +SYSTEM DROP UNCOMPRESSED CACHE; +SYSTEM FLUSH LOGS; +SYSTEM RELOAD CONFIG; +SYSTEM STOP MERGES; +SYSTEM START MERGES; +SYSTEM STOP TTL MERGES; +SYSTEM START TTL MERGES; +SYSTEM STOP MOVES; +SYSTEM START MOVES; +SYSTEM STOP FETCHES; +SYSTEM START FETCHES; +SYSTEM STOP REPLICATED SENDS; +SYSTEM START REPLICATED SENDS; + +-- SYSTEM RELOAD DICTIONARY sqllt.dictionary; -- temporary out of order: Code: 210, Connection refused (localhost:9001) (version 21.3.1.1) +-- DROP REPLICA +-- haha, no +-- SYSTEM KILL; +-- SYSTEM SHUTDOWN; + +-- Since we don't really care about the actual output, suppress it with `FORMAT Null`. +SELECT 'SHOW queries'; + +SHOW CREATE TABLE sqllt.table FORMAT Null; +SHOW CREATE DICTIONARY sqllt.dictionary FORMAT Null; +SHOW DATABASES LIKE 'sqllt' FORMAT Null; +SHOW TABLES FROM sqllt FORMAT Null; +SHOW DICTIONARIES FROM sqllt FORMAT Null; +SHOW GRANTS FORMAT Null; +SHOW GRANTS FOR sqllt_user FORMAT Null; +SHOW CREATE USER sqllt_user FORMAT Null; +SHOW CREATE ROLE sqllt_role FORMAT Null; +SHOW CREATE POLICY sqllt_policy FORMAT Null; +SHOW CREATE ROW POLICY sqllt_row_policy FORMAT Null; +SHOW CREATE QUOTA sqllt_quota FORMAT Null; +SHOW CREATE SETTINGS PROFILE sqllt_settings_profile FORMAT Null; + +SELECT 'GRANT queries'; +GRANT SELECT ON sqllt.table TO sqllt_user; +GRANT DROP ON sqllt.view TO sqllt_user; + +SELECT 'REVOKE queries'; +REVOKE SELECT ON sqllt.table FROM sqllt_user; +REVOKE DROP ON sqllt.view FROM sqllt_user; + +SELECT 'Misc queries'; +DESCRIBE TABLE sqllt.table FORMAT Null; + +CHECK TABLE sqllt.table FORMAT Null; +DETACH TABLE sqllt.table; +ATTACH TABLE sqllt.table; + +RENAME TABLE sqllt.table TO sqllt.table_new; +RENAME TABLE sqllt.table_new TO sqllt.table; +TRUNCATE TABLE sqllt.table; +DROP TABLE sqllt.table SYNC; + +SET log_comment=''; +--------------------------------------------------------------------------------------------------- +-- Now get all logs related to this test +--------------------------------------------------------------------------------------------------- + +SYSTEM FLUSH LOGS; +SELECT 'ACTUAL LOG CONTENT:'; + +-- Try to filter out all possible previous junk events by excluding old log entries, +SELECT query_kind, query FROM system.query_log +WHERE + log_comment LIKE '%system.query_log%' AND type == 'QueryStart' AND event_time >= now() - 10 + AND current_database == currentDatabase() +ORDER BY event_time_microseconds; + + +-- cleanup +SELECT 'DROP queries and also a cleanup after the test'; +DROP DATABASE IF EXISTS sqllt; +DROP USER IF EXISTS sqllt_user; +DROP ROLE IF EXISTS sqllt_role; +DROP POLICY IF EXISTS 
sqllt_policy ON sqllt.table, sqllt.view, sqllt.dictionary; +DROP ROW POLICY IF EXISTS sqllt_row_policy ON sqllt.table, sqllt.view, sqllt.dictionary; +DROP QUOTA IF EXISTS sqllt_quota; +DROP SETTINGS PROFILE IF EXISTS sqllt_settings_profile; diff --git a/tests/queries/0_stateless/01732_union_and_union_all.reference b/tests/queries/0_stateless/01732_union_and_union_all.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01732_union_and_union_all.sql b/tests/queries/0_stateless/01732_union_and_union_all.sql new file mode 100644 index 00000000000..2de6daa5bb9 --- /dev/null +++ b/tests/queries/0_stateless/01732_union_and_union_all.sql @@ -0,0 +1 @@ +select 1 UNION select 1 UNION ALL select 1; -- { serverError 558 } diff --git a/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.reference b/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.reference new file mode 100644 index 00000000000..bb08bbbe0b5 --- /dev/null +++ b/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.reference @@ -0,0 +1,3 @@ +0 2 3 +1 5 6 +2 8 9 diff --git a/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.sql b/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.sql new file mode 100644 index 00000000000..11a52976716 --- /dev/null +++ b/tests/queries/0_stateless/01759_dictionary_unique_attribute_names.sql @@ -0,0 +1,32 @@ +DROP DATABASE IF EXISTS 01759_db; +CREATE DATABASE 01759_db; + +DROP TABLE IF EXISTS 01759_db.dictionary_source_table; +CREATE TABLE 01759_db.dictionary_source_table +( + key UInt64, + value1 UInt64, + value2 UInt64 +) +ENGINE = TinyLog; + +INSERT INTO 01759_db.dictionary_source_table VALUES (0, 2, 3), (1, 5, 6), (2, 8, 9); + +DROP DICTIONARY IF EXISTS 01759_db.test_dictionary; + +CREATE DICTIONARY 01759_db.test_dictionary(key UInt64, value1 UInt64, value1 UInt64) +PRIMARY KEY key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'dictionary_source_table' DB '01759_db')) +LAYOUT(COMPLEX_KEY_DIRECT()); -- {serverError 36} + +CREATE DICTIONARY 01759_db.test_dictionary(key UInt64, value1 UInt64, value2 UInt64) +PRIMARY KEY key +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'dictionary_source_table' DB '01759_db')) +LAYOUT(COMPLEX_KEY_DIRECT()); + +SELECT number, dictGet('01759_db.test_dictionary', 'value1', tuple(number)) as value1, + dictGet('01759_db.test_dictionary', 'value2', tuple(number)) as value2 FROM system.numbers LIMIT 3; + +DROP TABLE 01759_db.dictionary_source_table; + +DROP DATABASE 01759_db; diff --git a/tests/queries/0_stateless/01760_modulo_negative.reference b/tests/queries/0_stateless/01760_modulo_negative.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01760_modulo_negative.sql b/tests/queries/0_stateless/01760_modulo_negative.sql new file mode 100644 index 00000000000..dbea06cc100 --- /dev/null +++ b/tests/queries/0_stateless/01760_modulo_negative.sql @@ -0,0 +1 @@ +SELECT -number % -9223372036854775808 FROM system.numbers; -- { serverError 153 } diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index f28eac03c37..25569f95dae 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -176,7 +176,7 @@ 01560_timeseriesgroupsum_segfault 00976_ttl_with_old_parts 01584_distributed_buffer_cannot_find_column -01018_ip_dictionary +01018_ip_dictionary_long 
01582_distinct_subquery_groupby 01558_ttest 01558_ttest_scipy @@ -221,3 +221,4 @@ 01304_polygons_sym_difference 01305_polygons_union 01306_polygons_intersection +01702_system_query_log diff --git a/tests/queries/query_test.py b/tests/queries/query_test.py index 417a51fe523..b747ac2944e 100644 --- a/tests/queries/query_test.py +++ b/tests/queries/query_test.py @@ -26,7 +26,7 @@ SKIP_LIST = [ "00990_metric_log_table_not_empty", "01014_lazy_database_concurrent_recreate_reattach_and_show_tables", "01018_Distributed__shard_num", - "01018_ip_dictionary", + "01018_ip_dictionary_long", "01050_clickhouse_dict_source_with_subquery", "01053_ssd_dictionary", "01054_cache_dictionary_overflow_cell", @@ -46,7 +46,7 @@ SKIP_LIST = [ "01293_client_interactive_vertical_singleline", # expect-test "01293_system_distribution_queue", # FLAKY "01293_show_clusters", - "01294_lazy_database_concurrent_recreate_reattach_and_show_tables", + "01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long", "01294_system_distributed_on_cluster", "01300_client_save_history_when_terminated", # expect-test "01304_direct_io", diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index cc34b828d36..957108cc9cc 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -45,6 +45,7 @@ "capnproto", "query_profiler", "memory_profiler", + "01175_distributed_ddl_output_mode_long", /// issue 21600 "01103_check_cpu_instructions_at_startup", "01086_odbc_roundtrip", /// can't pass because odbc libraries are not instrumented "00877_memory_limit_for_new_delete", /// memory limits don't work correctly under msan because it replaces malloc/free @@ -106,164 +107,50 @@ "00738_lock_for_inner_table" ], "database-replicated": [ - /// Tests with DETACH TABLE (it's not allowed) - /// and tests with SET (session and query settings are not supported) "memory_tracking", "memory_usage", "live_view", - "01720_type_map_and_casts", - "01413_alter_update_supertype", - "01149_zookeeper_mutation_stuck_after_replace_partition", - "00836_indices_alter_replicated_zookeeper", - "00652_mutations_alter_update", - "01715_tuple_insert_null_as_default", - "00825_protobuf_format_map", - "00152_insert_different_granularity", - "01715_background_checker_blather_zookeeper", - "01714_alter_drop_version", - "01114_materialize_clear_index_compact_parts", - "00814_replicated_minimalistic_part_header_zookeeper", - "01188_attach_table_from_pat", - "01415_sticking_mutations", - "01130_in_memory_parts", - "01110_dictionary_layout_without_arguments", - "01018_ddl_dictionaries_create", - "01018_ddl_dictionaries_select", - "01414_freeze_does_not_prevent_alters", - "01018_ddl_dictionaries_bad_queries", - "01686_rocksdb", - "01550_mutation_subquery", - "01070_mutations_with_dependencies", - "01070_materialize_ttl", - "01055_compact_parts", - "01017_mutations_with_nondeterministic_functions_zookeeper", - "00926_adaptive_index_granularity_pk", - "00910_zookeeper_test_alter_compression_codecs", - "00908_bloom_filter_index", - "00616_final_single_part", - "00446_clear_column_in_partition_zookeeper", - "01533_multiple_nested", - "01213_alter_rename_column_zookeeper", - "01575_disable_detach_table_of_dictionary", - "01457_create_as_table_function_structure", - "01415_inconsistent_merge_tree_settings", - "01413_allow_non_metadata_alters", - "01378_alter_rename_with_ttl_zookeeper", - "01349_mutation_datetime_key", - "01325_freeze_mutation_stuck", - "01272_suspicious_codecs", "01181_db_atomic_drop_on_cluster", - "00957_delta_diff_bug", - 
"00910_zookeeper_custom_compression_codecs_replicated", - "00899_long_attach_memory_limit", - "00804_test_custom_compression_codes_log_storages", - "00804_test_alter_compression_codecs", - "00804_test_delta_codec_no_type_alter", - "00804_test_custom_compression_codecs", - "00753_alter_attach", - "00715_fetch_merged_or_mutated_part_zookeeper", - "00688_low_cardinality_serialization", - "01575_disable_detach_table_of_dictionary", - "00738_lock_for_inner_table", - "01666_blns", - "01652_ignore_and_low_cardinality", - "01651_map_functions", - "01650_fetch_patition_with_macro_in_zk_path", - "01648_mutations_and_escaping", - "01640_marks_corruption_regression", - "01622_byte_size", - "01611_string_to_low_cardinality_key_alter", - "01602_show_create_view", - "01600_log_queries_with_extensive_info", - "01560_ttl_remove_empty_parts", - "01554_bloom_filter_index_big_integer_uuid", - "01550_type_map_formats_input", - "01550_type_map_formats", - "01550_create_map_type", - "01532_primary_key_without_order_by_zookeeper", - "01511_alter_version_versioned_collapsing_merge_tree_zookeeper", - "01509_parallel_quorum_insert_no_replicas", - "01504_compression_multiple_streams", - "01494_storage_join_persistency", - "01493_storage_set_persistency", - "01493_alter_remove_properties_zookeeper", - "01475_read_subcolumns_storages", - "01475_read_subcolumns", - "01451_replicated_detach_drop_part", - "01451_detach_drop_part", - "01440_big_int_exotic_casts", - "01430_modify_sample_by_zookeeper", - "01417_freeze_partition_verbose_zookeeper", - "01417_freeze_partition_verbose", - "01396_inactive_replica_cleanup_nodes_zookeeper", - "01375_compact_parts_codecs", - "01357_version_collapsing_attach_detach_zookeeper", - "01355_alter_column_with_order", - "01291_geo_types", - "01270_optimize_skip_unused_shards_low_cardinality", - "01182_materialized_view_different_structure", - "01150_ddl_guard_rwr", - "01148_zookeeper_path_macros_unfolding", - "01135_default_and_alter_zookeeper", - "01130_in_memory_parts_partitons", - "01127_month_partitioning_consistency_select", - "01114_database_atomic", - "01083_expressions_in_engine_arguments", - "01073_attach_if_not_exists", - "01072_optimize_skip_unused_shards_const_expr_eval", - "01071_prohibition_secondary_index_with_old_format_merge_tree", - "01062_alter_on_mutataion_zookeeper", - "01060_shutdown_table_after_detach", - "01056_create_table_as", - "01035_avg", - "01021_only_tuple_columns", - "01019_alter_materialized_view_query", - "01019_alter_materialized_view_consistent", - "01019_alter_materialized_view_atomic", - "01015_attach_part", - "00989_parallel_parts_loading", + "01175_distributed_ddl_output_mode", + "01415_sticking_mutations", "00980_zookeeper_merge_tree_alter_settings", - "00980_merge_alter_settings", + "01148_zookeeper_path_macros_unfolding", + "01294_system_distributed_on_cluster", + "01269_create_with_null", + /// grep -c + "01018_ddl_dictionaries_bad_queries", + "00908_bloom_filter_index", + /// Unsupported type of ALTER query + "01650_fetch_patition_with_macro_in_zk_path", + "01451_detach_drop_part", + "01451_replicated_detach_drop_part", + "01417_freeze_partition_verbose", + "01417_freeze_partition_verbose_zookeeper", + "01130_in_memory_parts_partitons", + "01060_shutdown_table_after_detach", + "01021_only_tuple_columns", + "01015_attach_part", "00955_test_final_mark", - "00933_reserved_word", - "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", - "00926_adaptive_index_granularity_replacing_merge_tree", - 
"00926_adaptive_index_granularity_merge_tree", - "00925_zookeeper_empty_replicated_merge_tree_optimize_final", - "00800_low_cardinality_distinct_numeric", - "00754_alter_modify_order_by_replicated_zookeeper", - "00751_low_cardinality_nullable_group_by", - "00751_default_databasename_for_view", - "00719_parallel_ddl_table", - "00718_low_cardinaliry_alter", - "00717_low_cardinaliry_distributed_group_by", - "00688_low_cardinality_syntax", - "00688_low_cardinality_nullable_cast", - "00688_low_cardinality_in", - "00652_replicated_mutations_zookeeper", - "00634_rename_view", + "00753_alter_attach", + "00626_replace_partition_from_table_zookeeper", "00626_replace_partition_from_table", - "00625_arrays_in_nested", + "00152_insert_different_granularity", + /// Old syntax is not allowed + "01062_alter_on_mutataion_zookeeper", + "00925_zookeeper_empty_replicated_merge_tree_optimize_final", + "00754_alter_modify_order_by_replicated_zookeeper", + "00652_replicated_mutations_zookeeper", "00623_replicated_truncate_table_zookeeper", - "00619_union_highlite", - "00599_create_view_with_subquery", - "00571_non_exist_database_when_create_materializ_view", - "00553_buff_exists_materlized_column", "00516_deduplication_after_drop_partition_zookeeper", - "00508_materialized_view_to", "00446_clear_column_in_partition_concurrent_zookeeper", - "00423_storage_log_single_thread", - "00311_array_primary_key", "00236_replicated_drop_on_non_leader_zookeeper", "00226_zookeeper_deduplication_and_unexpected_parts", "00215_primary_key_order_zookeeper", - "00180_attach_materialized_view", "00121_drop_column_zookeeper", - "00116_storage_set", "00083_create_merge_tree_zookeeper", "00062_replicated_merge_tree_alter_zookeeper", - "01720_constraints_complex_types", - "01747_alter_partition_key_enum_zookeeper" + /// Does not support renaming of multiple tables in single query + "00634_rename_view" ], "polymorphic-parts": [ "01508_partition_pruning_long", /// bug, shoud be fixed @@ -396,7 +283,7 @@ "01293_pretty_max_value_width", "01293_show_settings", "01294_create_settings_profile", - "01294_lazy_database_concurrent_recreate_reattach_and_show_tables", + "01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long", "01295_create_row_policy", "01296_create_row_policy_in_current_database", "01297_create_quota", @@ -592,7 +479,7 @@ "01018_ddl_dictionaries_select", "01018_ddl_dictionaries_special", "01018_dictionaries_from_dictionaries", - "01018_ip_dictionary", + "01018_ip_dictionary_long", "01021_only_tuple_columns", "01023_materialized_view_query_context", "01031_mutations_interpreter_and_context", @@ -681,7 +568,7 @@ "01281_unsucceeded_insert_select_queries_counter", "01293_system_distribution_queue", "01294_lazy_database_concurrent", - "01294_lazy_database_concurrent_recreate_reattach_and_show_tables", + "01294_lazy_database_concurrent_recreate_reattach_and_show_tables_long", "01294_system_distributed_on_cluster", "01296_create_row_policy_in_current_database", "01297_create_quota", @@ -746,6 +633,7 @@ "01601_detach_permanently", "01602_show_create_view", "01603_rename_overwrite_bug", + "01666_blns", "01646_system_restart_replicas_smoke", // system restart replicas is a global query "01656_test_query_log_factories_info", "01669_columns_declaration_serde", @@ -763,6 +651,7 @@ "polygon_dicts", // they use an explicitly specified database "01658_read_file_to_stringcolumn", "01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test. 
+ "01702_system_query_log", // It's ok to execute in parallel with oter tests but not several instances of the same test. "01748_dictionary_table_dot", // creates database "00950_dict_get", "01683_flat_dictionary", diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 8a39d591612..d38b34f3419 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (db-generator) add_subdirectory (wal-dump) add_subdirectory (check-mysql-binlog) + add_subdirectory (memcpy-bench) endif () if (ENABLE_CODE_QUALITY) diff --git a/utils/memcpy-bench/CMakeLists.txt b/utils/memcpy-bench/CMakeLists.txt new file mode 100644 index 00000000000..54dd0398912 --- /dev/null +++ b/utils/memcpy-bench/CMakeLists.txt @@ -0,0 +1,5 @@ +enable_language(ASM) +add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S) +#target_compile_options(memcpy-bench PRIVATE -mavx) +target_link_libraries(memcpy-bench PRIVATE dbms) + diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h new file mode 100644 index 00000000000..9c37524443a --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy.h @@ -0,0 +1,770 @@ +#pragma once + +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + +typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; +typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; +typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src) +{ + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src) +{ + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src) +{ + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 
2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src) +{ + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + __m128i m4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + __m128i m5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + __m128i m6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + __m128i m7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +/// Attribute is used to avoid an error with undefined behaviour sanitizer +/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer +/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. +__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) +{ + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) + { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + [[fallthrough]]; + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss - 65); + [[fallthrough]]; + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + [[fallthrough]]; + case 2: + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + [[fallthrough]]; + case 3: + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + [[fallthrough]]; + case 4: + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + [[fallthrough]]; + case 5: + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + [[fallthrough]]; + case 6: + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + [[fallthrough]]; + case 7: + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + [[fallthrough]]; + case 8: + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + [[fallthrough]]; + case 9: + *((uint64_unaligned_t*)(dd - 
9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + [[fallthrough]]; + case 10: + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + [[fallthrough]]; + case 11: + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + [[fallthrough]]; + case 12: + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + [[fallthrough]]; + case 13: + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + [[fallthrough]]; + case 14: + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + [[fallthrough]]; + case 15: + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + [[fallthrough]]; + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + [[fallthrough]]; + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + [[fallthrough]]; + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + [[fallthrough]]; + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + [[fallthrough]]; + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + [[fallthrough]]; + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + [[fallthrough]]; + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + [[fallthrough]]; + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + [[fallthrough]]; + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + [[fallthrough]]; + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: 
+ memcpy_sse2_64(dd - 90, ss - 90); + [[fallthrough]]; + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + [[fallthrough]]; + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + [[fallthrough]]; + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + [[fallthrough]]; + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + [[fallthrough]]; + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + [[fallthrough]]; + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + [[fallthrough]]; + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + [[fallthrough]]; + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + [[fallthrough]]; + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + [[fallthrough]]; + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + [[fallthrough]]; + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + [[fallthrough]]; + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + [[fallthrough]]; + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + [[fallthrough]]; + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + [[fallthrough]]; + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + [[fallthrough]]; + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + [[fallthrough]]; + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + [[fallthrough]]; + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + 
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + [[fallthrough]]; + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + [[fallthrough]]; + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + [[fallthrough]]; + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + [[fallthrough]]; + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + [[fallthrough]]; + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + [[fallthrough]]; + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + [[fallthrough]]; + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + [[fallthrough]]; + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + [[fallthrough]]; + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + [[fallthrough]]; + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + [[fallthrough]]; + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + [[fallthrough]]; + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + [[fallthrough]]; + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + [[fallthrough]]; + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, 
ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + [[fallthrough]]; + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + [[fallthrough]]; + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + [[fallthrough]]; + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + [[fallthrough]]; + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + [[fallthrough]]; + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + [[fallthrough]]; + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) +{ + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) + { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) + { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else + { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) + { // source aligned + for (; size >= 128; size -= 128) + { + 
c0 = _mm_load_si128((reinterpret_cast(src)) + 0); + c1 = _mm_load_si128((reinterpret_cast(src)) + 1); + c2 = _mm_load_si128((reinterpret_cast(src)) + 2); + c3 = _mm_load_si128((reinterpret_cast(src)) + 3); + c4 = _mm_load_si128((reinterpret_cast(src)) + 4); + c5 = _mm_load_si128((reinterpret_cast(src)) + 5); + c6 = _mm_load_si128((reinterpret_cast(src)) + 6); + c7 = _mm_load_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else + { // source unaligned + for (; size >= 128; size -= 128) + { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} diff --git a/utils/memcpy-bench/FastMemcpy_Avx.h b/utils/memcpy-bench/FastMemcpy_Avx.h new file mode 100644 index 00000000000..ee7d4e19536 --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy_Avx.h @@ -0,0 +1,496 @@ +#pragma once + +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src) +{ +#if 1 + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +#else + *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); + *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); +#endif +} + +static INLINE void memcpy_avx_32(void *dst, const void *src) +{ + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_avx_64(void *dst, const void *src) +{ + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_avx_128(void *dst, const void *src) +{ + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_avx_256(void *dst, const void *src) +{ + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + __m256i m4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + __m256i m5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + __m256i m6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + __m256i m7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 4, m4); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 5, m5); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 6, m6); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 7, m7); +} + + 
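To make the long switch below easier to read: each large case (for example sizes 129..256 in the AVX variant) first copies a fixed 128-byte block from the beginning of the buffer and then falls through to the matching small case, which finishes the remainder measured from the end, so the two copies may overlap; uneven small sizes are likewise finished with overlapping scalar moves instead of a byte-by-byte tail loop. Below is a minimal, hypothetical sketch of that overlapping-move idea for illustration only — it is not part of the patch, and the helper name copy_8_to_16 is invented here:

#include <cstdint>
#include <cstring>
#include <cstddef>

/// Copy any size in [8, 16] bytes with two 8-byte moves whose ranges may overlap,
/// so no tail loop is needed; the jump tables below apply the same idea with
/// 16/32/64/128-byte vector moves plus a short scalar finish.
static inline void copy_8_to_16(char * __restrict dst, const char * __restrict src, std::size_t size)
{
    uint64_t head;
    uint64_t tail;
    std::memcpy(&head, src, 8);             /// first 8 bytes
    std::memcpy(&tail, src + size - 8, 8);  /// last 8 bytes, overlapping the first load when size < 16
    std::memcpy(dst, &head, 8);
    std::memcpy(dst + size - 8, &tail, 8);  /// overlapping store rewrites the shared middle with identical bytes
}

The fixed-size std::memcpy calls compile down to plain 8-byte loads and stores; writing the tail after the head is safe because the overlapping region receives the same bytes either way.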
+//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size) +{ + unsigned char *dd = reinterpret_cast(dst) + size; + const unsigned char *ss = reinterpret_cast(src) + size; + + switch (size) + { + case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]]; + case 0: break; + case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]]; + case 1: dd[-1] = ss[-1]; break; + case 130: memcpy_avx_128(dd - 130, ss - 130); [[fallthrough]]; + case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 131: memcpy_avx_128(dd - 131, ss - 131); [[fallthrough]]; + case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; + case 132: memcpy_avx_128(dd - 132, ss - 132); [[fallthrough]]; + case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 133: memcpy_avx_128(dd - 133, ss - 133); [[fallthrough]]; + case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; + case 134: memcpy_avx_128(dd - 134, ss - 134); [[fallthrough]]; + case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 135: memcpy_avx_128(dd - 135, ss - 135); [[fallthrough]]; + case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 136: memcpy_avx_128(dd - 136, ss - 136); [[fallthrough]]; + case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 137: memcpy_avx_128(dd - 137, ss - 137); [[fallthrough]]; + case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; + case 138: memcpy_avx_128(dd - 138, ss - 138); [[fallthrough]]; + case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 139: memcpy_avx_128(dd - 139, ss - 139); [[fallthrough]]; + case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 140: memcpy_avx_128(dd - 140, ss - 140); [[fallthrough]]; + case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 141: memcpy_avx_128(dd - 141, ss - 141); [[fallthrough]]; + case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 142: memcpy_avx_128(dd - 142, ss - 142); [[fallthrough]]; + case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 143: memcpy_avx_128(dd - 143, ss - 143); [[fallthrough]]; + case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 144: memcpy_avx_128(dd - 144, ss - 144); [[fallthrough]]; + case 16: memcpy_avx_16(dd - 16, ss - 16); break; + case 145: memcpy_avx_128(dd - 145, ss - 145); [[fallthrough]]; + case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; + case 146: memcpy_avx_128(dd - 146, ss - 146); [[fallthrough]]; + case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 147: memcpy_avx_128(dd - 147, ss - 147); [[fallthrough]]; + case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 148: memcpy_avx_128(dd - 148, ss - 148); [[fallthrough]]; + 
case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 149: memcpy_avx_128(dd - 149, ss - 149); [[fallthrough]]; + case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 150: memcpy_avx_128(dd - 150, ss - 150); [[fallthrough]]; + case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 151: memcpy_avx_128(dd - 151, ss - 151); [[fallthrough]]; + case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 152: memcpy_avx_128(dd - 152, ss - 152); [[fallthrough]]; + case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 153: memcpy_avx_128(dd - 153, ss - 153); [[fallthrough]]; + case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; + case 154: memcpy_avx_128(dd - 154, ss - 154); [[fallthrough]]; + case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; + case 155: memcpy_avx_128(dd - 155, ss - 155); [[fallthrough]]; + case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; + case 156: memcpy_avx_128(dd - 156, ss - 156); [[fallthrough]]; + case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; + case 157: memcpy_avx_128(dd - 157, ss - 157); [[fallthrough]]; + case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; + case 158: memcpy_avx_128(dd - 158, ss - 158); [[fallthrough]]; + case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; + case 159: memcpy_avx_128(dd - 159, ss - 159); [[fallthrough]]; + case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; + case 160: memcpy_avx_128(dd - 160, ss - 160); [[fallthrough]]; + case 32: memcpy_avx_32(dd - 32, ss - 32); break; + case 161: memcpy_avx_128(dd - 161, ss - 161); [[fallthrough]]; + case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; + case 162: memcpy_avx_128(dd - 162, ss - 162); [[fallthrough]]; + case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 163: memcpy_avx_128(dd - 163, ss - 163); [[fallthrough]]; + case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 164: memcpy_avx_128(dd - 164, ss - 164); [[fallthrough]]; + case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 165: memcpy_avx_128(dd - 165, ss - 165); [[fallthrough]]; + case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 166: memcpy_avx_128(dd - 166, ss - 166); [[fallthrough]]; + case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 167: memcpy_avx_128(dd - 167, ss - 167); [[fallthrough]]; + case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 168: memcpy_avx_128(dd - 168, ss - 168); [[fallthrough]]; + case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 169: memcpy_avx_128(dd - 169, ss - 169); [[fallthrough]]; + case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; + case 170: memcpy_avx_128(dd - 170, ss - 170); [[fallthrough]]; + case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; + case 171: memcpy_avx_128(dd - 171, ss - 171); 
[[fallthrough]]; + case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; + case 172: memcpy_avx_128(dd - 172, ss - 172); [[fallthrough]]; + case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; + case 173: memcpy_avx_128(dd - 173, ss - 173); [[fallthrough]]; + case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; + case 174: memcpy_avx_128(dd - 174, ss - 174); [[fallthrough]]; + case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; + case 175: memcpy_avx_128(dd - 175, ss - 175); [[fallthrough]]; + case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; + case 176: memcpy_avx_128(dd - 176, ss - 176); [[fallthrough]]; + case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; + case 177: memcpy_avx_128(dd - 177, ss - 177); [[fallthrough]]; + case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; + case 178: memcpy_avx_128(dd - 178, ss - 178); [[fallthrough]]; + case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; + case 179: memcpy_avx_128(dd - 179, ss - 179); [[fallthrough]]; + case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; + case 180: memcpy_avx_128(dd - 180, ss - 180); [[fallthrough]]; + case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; + case 181: memcpy_avx_128(dd - 181, ss - 181); [[fallthrough]]; + case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; + case 182: memcpy_avx_128(dd - 182, ss - 182); [[fallthrough]]; + case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; + case 183: memcpy_avx_128(dd - 183, ss - 183); [[fallthrough]]; + case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; + case 184: memcpy_avx_128(dd - 184, ss - 184); [[fallthrough]]; + case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; + case 185: memcpy_avx_128(dd - 185, ss - 185); [[fallthrough]]; + case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; + case 186: memcpy_avx_128(dd - 186, ss - 186); [[fallthrough]]; + case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; + case 187: memcpy_avx_128(dd - 187, ss - 187); [[fallthrough]]; + case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; + case 188: memcpy_avx_128(dd - 188, ss - 188); [[fallthrough]]; + case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; + case 189: memcpy_avx_128(dd - 189, ss - 189); [[fallthrough]]; + case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; + case 190: memcpy_avx_128(dd - 190, ss - 190); [[fallthrough]]; + case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; + case 191: memcpy_avx_128(dd - 191, ss - 191); [[fallthrough]]; + case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; + case 192: memcpy_avx_128(dd - 192, ss - 192); [[fallthrough]]; + case 64: memcpy_avx_64(dd - 64, ss - 64); break; + case 193: memcpy_avx_128(dd - 193, ss - 193); [[fallthrough]]; + case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; + case 194: memcpy_avx_128(dd - 194, ss - 194); [[fallthrough]]; + case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 195: memcpy_avx_128(dd - 195, ss - 195); 
[[fallthrough]]; + case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 196: memcpy_avx_128(dd - 196, ss - 196); [[fallthrough]]; + case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 197: memcpy_avx_128(dd - 197, ss - 197); [[fallthrough]]; + case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 198: memcpy_avx_128(dd - 198, ss - 198); [[fallthrough]]; + case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 199: memcpy_avx_128(dd - 199, ss - 199); [[fallthrough]]; + case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 200: memcpy_avx_128(dd - 200, ss - 200); [[fallthrough]]; + case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 201: memcpy_avx_128(dd - 201, ss - 201); [[fallthrough]]; + case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; + case 202: memcpy_avx_128(dd - 202, ss - 202); [[fallthrough]]; + case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; + case 203: memcpy_avx_128(dd - 203, ss - 203); [[fallthrough]]; + case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; + case 204: memcpy_avx_128(dd - 204, ss - 204); [[fallthrough]]; + case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; + case 205: memcpy_avx_128(dd - 205, ss - 205); [[fallthrough]]; + case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; + case 206: memcpy_avx_128(dd - 206, ss - 206); [[fallthrough]]; + case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; + case 207: memcpy_avx_128(dd - 207, ss - 207); [[fallthrough]]; + case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; + case 208: memcpy_avx_128(dd - 208, ss - 208); [[fallthrough]]; + case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; + case 209: memcpy_avx_128(dd - 209, ss - 209); [[fallthrough]]; + case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; + case 210: memcpy_avx_128(dd - 210, ss - 210); [[fallthrough]]; + case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; + case 211: memcpy_avx_128(dd - 211, ss - 211); [[fallthrough]]; + case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; + case 212: memcpy_avx_128(dd - 212, ss - 212); [[fallthrough]]; + case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; + case 213: memcpy_avx_128(dd - 213, ss - 213); [[fallthrough]]; + case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; + case 214: memcpy_avx_128(dd - 214, ss - 214); [[fallthrough]]; + case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; + case 215: memcpy_avx_128(dd - 215, ss - 215); [[fallthrough]]; + case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; + case 216: memcpy_avx_128(dd - 216, ss - 216); [[fallthrough]]; + case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; + case 217: memcpy_avx_128(dd - 217, ss - 217); [[fallthrough]]; + case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; + case 218: memcpy_avx_128(dd - 218, ss - 218); [[fallthrough]]; + case 90: 
memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; + case 219: memcpy_avx_128(dd - 219, ss - 219); [[fallthrough]]; + case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; + case 220: memcpy_avx_128(dd - 220, ss - 220); [[fallthrough]]; + case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; + case 221: memcpy_avx_128(dd - 221, ss - 221); [[fallthrough]]; + case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; + case 222: memcpy_avx_128(dd - 222, ss - 222); [[fallthrough]]; + case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; + case 223: memcpy_avx_128(dd - 223, ss - 223); [[fallthrough]]; + case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; + case 224: memcpy_avx_128(dd - 224, ss - 224); [[fallthrough]]; + case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; + case 225: memcpy_avx_128(dd - 225, ss - 225); [[fallthrough]]; + case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; + case 226: memcpy_avx_128(dd - 226, ss - 226); [[fallthrough]]; + case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; + case 227: memcpy_avx_128(dd - 227, ss - 227); [[fallthrough]]; + case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; + case 228: memcpy_avx_128(dd - 228, ss - 228); [[fallthrough]]; + case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; + case 229: memcpy_avx_128(dd - 229, ss - 229); [[fallthrough]]; + case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; + case 230: memcpy_avx_128(dd - 230, ss - 230); [[fallthrough]]; + case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; + case 231: memcpy_avx_128(dd - 231, ss - 231); [[fallthrough]]; + case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; + case 232: memcpy_avx_128(dd - 232, ss - 232); [[fallthrough]]; + case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; + case 233: memcpy_avx_128(dd - 233, ss - 233); [[fallthrough]]; + case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; + case 234: memcpy_avx_128(dd - 234, ss - 234); [[fallthrough]]; + case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; + case 235: memcpy_avx_128(dd - 235, ss - 235); [[fallthrough]]; + case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; + case 236: memcpy_avx_128(dd - 236, ss - 236); [[fallthrough]]; + case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; + case 237: memcpy_avx_128(dd - 237, ss - 237); [[fallthrough]]; + case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; + case 238: memcpy_avx_128(dd - 238, ss - 238); [[fallthrough]]; + case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; + case 239: memcpy_avx_128(dd - 239, ss - 239); [[fallthrough]]; + case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; + case 240: memcpy_avx_128(dd - 240, ss - 240); [[fallthrough]]; + case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; + case 241: memcpy_avx_128(dd - 241, ss - 241); [[fallthrough]]; + case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; + case 242: 
memcpy_avx_128(dd - 242, ss - 242); [[fallthrough]]; + case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; + case 243: memcpy_avx_128(dd - 243, ss - 243); [[fallthrough]]; + case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; + case 244: memcpy_avx_128(dd - 244, ss - 244); [[fallthrough]]; + case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; + case 245: memcpy_avx_128(dd - 245, ss - 245); [[fallthrough]]; + case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; + case 246: memcpy_avx_128(dd - 246, ss - 246); [[fallthrough]]; + case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; + case 247: memcpy_avx_128(dd - 247, ss - 247); [[fallthrough]]; + case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; + case 248: memcpy_avx_128(dd - 248, ss - 248); [[fallthrough]]; + case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; + case 249: memcpy_avx_128(dd - 249, ss - 249); [[fallthrough]]; + case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; + case 250: memcpy_avx_128(dd - 250, ss - 250); [[fallthrough]]; + case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; + case 251: memcpy_avx_128(dd - 251, ss - 251); [[fallthrough]]; + case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; + case 252: memcpy_avx_128(dd - 252, ss - 252); [[fallthrough]]; + case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; + case 253: memcpy_avx_128(dd - 253, ss - 253); [[fallthrough]]; + case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; + case 254: memcpy_avx_128(dd - 254, ss - 254); [[fallthrough]]; + case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; + case 255: memcpy_avx_128(dd - 255, ss - 255); [[fallthrough]]; + case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; + case 256: memcpy_avx_256(dd - 256, ss - 256); break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast_avx(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + static size_t cachesize = 0x200000; // L3-cache size + size_t padding; + + // small memory copy + if (size <= 256) + { + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + return destination; + } + + // align destination to 16 bytes boundary + padding = (32 - (((size_t)dst) & 31)) & 31; + +#if 0 + if (padding > 0) + { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } +#else + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; +#endif + + // medium size copy + if (size <= cachesize) + { + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 256; size -= 256) + { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = 
_mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else + { // big memory copy + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ + + if ((((size_t)src) & 31) == 0) + { // source aligned + for (; size >= 256; size -= 256) + { + c0 = _mm256_load_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_load_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_load_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_load_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_load_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_load_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_load_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_load_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else + { // source unaligned + for (; size >= 256; size -= 256) + { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + _mm_sfence(); + } + + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + + return destination; +} diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp new file mode 100644 index 00000000000..2df72cb5ccb --- /dev/null +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -0,0 +1,620 @@ +#include +#include +#include +#include 
+#include +#include +#include + +#include + +#include + +#include + +#include + +#pragma GCC diagnostic ignored "-Wold-style-cast" +#pragma GCC diagnostic ignored "-Wcast-align" +#pragma GCC diagnostic ignored "-Wcast-qual" +#include "FastMemcpy.h" +//#include "FastMemcpy_Avx.h" + +#include +#include + + +template +void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) +{ + while (size) + { + size_t bytes_to_copy = std::min(size, chunk_size_distribution()); + + impl(dst, src, bytes_to_copy); + + dst += bytes_to_copy; + src += bytes_to_copy; + size -= bytes_to_copy; + } +} + + +using RNG = pcg32_fast; + +template +size_t generatorUniform(RNG & rng) { return rng() % N; }; + + +template +void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl) +{ + Stopwatch watch; + + std::vector threads; + threads.reserve(num_threads); + + for (size_t thread_num = 0; thread_num < num_threads; ++thread_num) + { + size_t begin = size * thread_num / num_threads; + size_t end = size * (thread_num + 1) / num_threads; + + threads.emplace_back([begin, end, iterations, &src, &dst, &generator, &impl] + { + for (size_t iteration = 0; iteration < iterations; ++iteration) + { + loop( + iteration % 2 ? &src[begin] : &dst[begin], + iteration % 2 ? &dst[begin] : &src[begin], + end - begin, + [rng = RNG(), &generator]() mutable { return generator(rng); }, + std::forward(impl)); + } + }); + } + + for (auto & thread : threads) + thread.join(); + + double elapsed_ns = watch.elapsed(); + + /// Validation + size_t sum = 0; + for (size_t i = 0; i < size; ++i) + sum += dst[i]; + + std::cerr << std::fixed << std::setprecision(3) + << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n"; +} + + +using memcpy_type = void * (*)(const void * __restrict, void * __restrict, size_t); + + +static void * memcpy_erms(void * dst, const void * src, size_t size) +{ + asm volatile ( + "rep movsb" + : "=D"(dst), "=S"(src), "=c"(size) + : "0"(dst), "1"(src), "2"(size) + : "memory"); + return dst; +} + +extern "C" void * memcpy_jart(void * dst, const void * src, size_t size); +extern "C" void MemCpy(void * dst, const void * src, size_t size); + + +static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 16) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0; + + for (; size >= 16; size -= 16) + { + c0 = _mm_loadu_si128(reinterpret_cast(src)); + src += 16; + _mm_store_si128((reinterpret_cast<__m128i*>(dst)), c0); + dst += 16; + } + + memcpy_tiny(dst, src, size); + return destination; +} + +static void * memcpySSE2Unrolled2(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 32) + return memcpy_tiny(dst, src, size); + + // align 
destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1; + + for (; size >= 32; size -= 32) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + src += 32; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + dst += 32; + } + + memcpy_tiny(dst, src, size); + return destination; +} + +static void * memcpySSE2Unrolled4(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 64) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1, c2, c3; + + for (; size >= 64; size -= 64) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + src += 64; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + dst += 64; + } + + memcpy_tiny(dst, src, size); + return destination; +} + + +static void * memcpySSE2Unrolled8(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 128) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + 
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + } + + memcpy_tiny(dst, src, size); + return destination; +} + + +//static __attribute__((__always_inline__, __target__("sse2"))) +__attribute__((__always_inline__)) +void memcpy_my_medium_sse(uint8_t * __restrict & dst, const uint8_t * __restrict & src, size_t & size) +{ + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } +} + +__attribute__((__target__("avx"))) +void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * __restrict & __restrict src, size_t & __restrict size) +{ + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) + { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 256) + { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + + size -= 256; + } +} + +bool have_avx = true; + +static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size) +{ + uint8_t * ret = dst; + +tail: + if (size <= 16) + { + if (size >= 8) + { + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if 
(size >= 4) + { + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + *dst = *src; + } + } + else if (have_avx) + { + if (size <= 32) + { + __builtin_memcpy(dst, src, 8); + __builtin_memcpy(dst + 8, src + 8, 8); + + dst += 16; + src += 16; + size -= 16; + + goto tail; + } + + if (size <= 256) + { + __asm__( + "vmovups -0x20(%[s],%[size],1), %%ymm0\n" + "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" + : [d]"+r"(dst), [s]"+r"(src) + : [size]"r"(size) + : "ymm0", "memory"); + + while (size > 32) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += 32; + src += 32; + size -= 32; + } + } + else + { + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += padding; + src += padding; + size -= padding; + } + + while (size >= 256) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups 0x20(%[s]), %%ymm1\n" + "vmovups 0x40(%[s]), %%ymm2\n" + "vmovups 0x60(%[s]), %%ymm3\n" + "vmovups 0x80(%[s]), %%ymm4\n" + "vmovups 0xa0(%[s]), %%ymm5\n" + "vmovups 0xc0(%[s]), %%ymm6\n" + "vmovups 0xe0(%[s]), %%ymm7\n" + "add $0x100,%[s]\n" + "vmovaps %%ymm0, (%[d])\n" + "vmovaps %%ymm1, 0x20(%[d])\n" + "vmovaps %%ymm2, 0x40(%[d])\n" + "vmovaps %%ymm3, 0x60(%[d])\n" + "vmovaps %%ymm4, 0x80(%[d])\n" + "vmovaps %%ymm5, 0xa0(%[d])\n" + "vmovaps %%ymm6, 0xc0(%[d])\n" + "vmovaps %%ymm7, 0xe0(%[d])\n" + "add $0x100, %[d]\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); + + size -= 256; + } + + goto tail; + } + } + else + { + if (size <= 128) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. 
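+            /// After the head copy above, dst is 16-byte aligned, so the loop
+            /// below can use aligned stores (_mm_store_si128); the loads stay
+            /// unaligned because src may have a different misalignment.
+            /// Each iteration moves 128 bytes through eight XMM registers.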
+            __m128i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            while (size >= 128)
+            {
+                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
+                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
+                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
+                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
+                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
+                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
+                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
+                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
+                src += 128;
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
+                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
+                dst += 128;
+
+                size -= 128;
+            }
+
+            goto tail;
+        }
+    }
+
+    return ret;
+}
+
+
+template <typename F>
+void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator)
+{
+    memcpy_type memcpy_libc = reinterpret_cast<memcpy_type>(dlsym(RTLD_NEXT, "memcpy"));
+
+    if (memcpy_variant == 1)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy);
+    if (memcpy_variant == 2)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_libc);
+    if (memcpy_variant == 3)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_erms);
+    if (memcpy_variant == 4)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), MemCpy);
+    if (memcpy_variant == 5)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2);
+    if (memcpy_variant == 6)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled2);
+    if (memcpy_variant == 7)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled4);
+    if (memcpy_variant == 8)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpySSE2Unrolled8);
+//    if (memcpy_variant == 9)
+//        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_fast_avx);
+    if (memcpy_variant == 10)
+        test(dst, src, size, iterations, num_threads, std::forward<F>(generator), memcpy_my);
+}
+
+void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads)
+{
+    if (generator_variant == 1)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>);
+    if (generator_variant == 2)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>);
+    if (generator_variant == 3)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>);
+    if (generator_variant == 4)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>);
+    if (generator_variant == 5)
+        dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>);
+}
+
+
+int main(int argc, char ** argv)
+{
+    size_t size = 1000000000;
+    if (argc >= 2)
+        size = std::stoull(argv[1]);
+
+    size_t iterations = 10;
+    if (argc >= 3)
+        iterations = std::stoull(argv[2]);
+
+    size_t num_threads = 1;
+    if
(argc >= 4) + num_threads = std::stoull(argv[3]); + + size_t memcpy_variant = 1; + if (argc >= 5) + memcpy_variant = std::stoull(argv[4]); + + size_t generator_variant = 1; + if (argc >= 6) + generator_variant = std::stoull(argv[5]); + + std::unique_ptr src(new uint8_t[size]); + std::unique_ptr dst(new uint8_t[size]); + + /// Fill src with some pattern for validation. + for (size_t i = 0; i < size; ++i) + src[i] = i; + + /// Fill dst to avoid page faults. + memset(dst.get(), 0, size); + + dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + + return 0; +} diff --git a/utils/memcpy-bench/memcpy_jart.S b/utils/memcpy-bench/memcpy_jart.S new file mode 100644 index 00000000000..50430d0abe0 --- /dev/null +++ b/utils/memcpy-bench/memcpy_jart.S @@ -0,0 +1,138 @@ +/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ +│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2020 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +// Copies memory. +// +// DEST and SRC must not overlap, unless DEST≤SRC. +// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @return original rdi copied to rax +// @mode long +// @asyncsignalsafe +memcpy_jart: mov %rdi,%rax +// 𝑠𝑙𝑖𝑑𝑒 + .align 16 + .type memcpy_jart,@function + .size memcpy_jart,.-memcpy_jart + .globl memcpy_jart + +// Copies memory w/ minimal impact ABI. 
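+// Small sizes are dispatched through the memcpytab jump table below to
+// fixed-size copy routines; anything larger goes to .L16r, which uses an
+// SSE2 loop, switches to "rep movsb" (ERMS) at 1 KiB and to non-temporal
+// stores above 1 MiB, followed by an sfence.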
+// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @clob flags,rcx,xmm3,xmm4 +// @mode long +MemCpy: mov $.Lmemcpytab.size,%ecx + cmp %rcx,%rdx + cmovb %rdx,%rcx + jmp *memcpytab(,%rcx,8) +.Lanchorpoint: +.L16r: cmp $1024,%rdx + jae .Lerms +.L16: movdqu -16(%rsi,%rdx),%xmm4 + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movdqu %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + movdqu %xmm4,-16(%rdi,%rdx) + pxor %xmm4,%xmm4 + pxor %xmm3,%xmm3 + jmp .L0 +.L8: push %rbx + mov (%rsi),%rcx + mov -8(%rsi,%rdx),%rbx + mov %rcx,(%rdi) + mov %rbx,-8(%rdi,%rdx) +1: pop %rbx +.L0: ret +.L4: push %rbx + mov (%rsi),%ecx + mov -4(%rsi,%rdx),%ebx + mov %ecx,(%rdi) + mov %ebx,-4(%rdi,%rdx) + jmp 1b +.L3: push %rbx + mov (%rsi),%cx + mov -2(%rsi,%rdx),%bx + mov %cx,(%rdi) + mov %bx,-2(%rdi,%rdx) + jmp 1b +.L2: mov (%rsi),%cx + mov %cx,(%rdi) + jmp .L0 +.L1: mov (%rsi),%cl + mov %cl,(%rdi) + jmp .L0 +.Lerms: cmp $1024*1024,%rdx + ja .Lnts + push %rdi + push %rsi + mov %rdx,%rcx + rep movsb + pop %rsi + pop %rdi + jmp .L0 +.Lnts: movdqu (%rsi),%xmm3 + movdqu %xmm3,(%rdi) + lea 16(%rdi),%rcx + and $-16,%rcx + sub %rdi,%rcx + add %rcx,%rdi + add %rcx,%rsi + sub %rcx,%rdx + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movntdq %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + sfence + movdqu -16(%rsi,%rdx),%xmm3 + movdqu %xmm3,-16(%rdi,%rdx) + pxor %xmm3,%xmm3 + jmp .L0 + .type MemCpy,@function + .size MemCpy,.-MemCpy + .globl MemCpy + + .section .rodata + .align 8 +memcpytab: + .quad .L0 + .quad .L1 + .quad .L2 + .quad .L3 + .rept 4 + .quad .L4 + .endr + .rept 8 + .quad .L8 + .endr + .rept 16 + .quad .L16 + .endr + .equ .Lmemcpytab.size,(.-memcpytab)/8 + .quad .L16r # SSE + ERMS + NTS + .type memcpytab,@object + .previous
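The size <= 16 branches of memcpy_my above (and the .L8/.L4/.L3 paths of MemCpy) cover a whole range of sizes with just two fixed-width copies: one of the head and one of the tail, which may overlap in the middle. Below is a minimal standalone sketch of that idea, not part of the patch; copy_8_to_16 and the sample string are hypothetical names used only for illustration.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

/// Copy any size in [8, 16] with two fixed 8-byte moves: the last 8 bytes and
/// the first 8 bytes. For size < 16 the two destination ranges overlap, which
/// is harmless because every byte is written with its correct source value.
static void copy_8_to_16(uint8_t * dst, const uint8_t * src, size_t size)
{
    std::memcpy(dst + size - 8, src + size - 8, 8);
    std::memcpy(dst, src, 8);
}

int main()
{
    const char * text = "overlapping tail";   /// 16 bytes of sample data
    char out[17] = {};

    for (size_t size = 8; size <= 16; ++size)
    {
        copy_8_to_16(reinterpret_cast<uint8_t *>(out), reinterpret_cast<const uint8_t *>(text), size);
        std::printf("%zu: %.*s\n", size, static_cast<int>(size), out);
    }
    return 0;
}

Because both moves have a fixed width, a compiler typically lowers each std::memcpy call to a single 8-byte load and store, which is exactly what the branch-per-range structure of memcpy_my relies on.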