From feaed1e020934ddac683fab616fd2927e6d256a8 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov Date: Wed, 20 May 2020 17:43:01 +0200 Subject: [PATCH] rand isn't avx2-vectorizable, I give it up --- src/Functions/FunctionsRandom.cpp | 581 +++++++++--------------------- src/Functions/FunctionsRandom.h | 111 +----- src/Functions/VectorExtension.h | 101 ++++++ 3 files changed, 278 insertions(+), 515 deletions(-) create mode 100644 src/Functions/VectorExtension.h diff --git a/src/Functions/FunctionsRandom.cpp b/src/Functions/FunctionsRandom.cpp index 283013bdb9b..5ab51e9e3b8 100644 --- a/src/Functions/FunctionsRandom.cpp +++ b/src/Functions/FunctionsRandom.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,11 +9,6 @@ namespace DB { -// TODO(dakovalkov): remove this workaround. -#if !defined(__clang__) -# pragma GCC diagnostic ignored "-Wvector-operation-performance" -#endif - DECLARE_MULTITARGET_CODE( namespace @@ -80,250 +76,34 @@ void RandImpl::execute(char * output, size_t size) /// It is guaranteed (by PaddedPODArray) that we can overwrite up to 15 bytes after end. } -void RandImpl2::execute(char * output, size_t size) -{ - if (size == 0) - return; - - LinearCongruentialGenerator generator0; - LinearCongruentialGenerator generator1; - LinearCongruentialGenerator generator2; - LinearCongruentialGenerator generator3; - LinearCongruentialGenerator generator4; - LinearCongruentialGenerator generator5; - LinearCongruentialGenerator generator6; - LinearCongruentialGenerator generator7; - - UInt64 rand_seed = randomSeed(); - - seed(generator0, rand_seed, 0xfaaae481acb5874aULL + reinterpret_cast(output)); - seed(generator1, rand_seed, 0x3181a34f32887db6ULL + reinterpret_cast(output)); - seed(generator2, rand_seed, 0xb6970e4a91b66afdULL + reinterpret_cast(output)); - seed(generator3, rand_seed, 0xc16062649e83dc13ULL + reinterpret_cast(output)); - seed(generator4, rand_seed, 0xbb093972da5c8d92ULL + reinterpret_cast(output)); - seed(generator5, rand_seed, 0xc37dcc410dcfed31ULL + reinterpret_cast(output)); - seed(generator6, rand_seed, 0x45e1526b7a4367d5ULL + reinterpret_cast(output)); - seed(generator7, rand_seed, 0x99c2759203868a7fULL + reinterpret_cast(output)); - - const char * end = output + size; - - constexpr int bytes_per_write = 32; - constexpr int safe_overwrite = 15; - - for (; (end - output) + safe_overwrite >= bytes_per_write; output += safe_overwrite) - { - unalignedStore(output, generator0.next()); - unalignedStore(output + 4, generator1.next()); - unalignedStore(output + 8, generator2.next()); - unalignedStore(output + 12, generator3.next()); - unalignedStore(output + 16, generator4.next()); - unalignedStore(output + 20, generator5.next()); - unalignedStore(output + 24, generator6.next()); - unalignedStore(output + 28, generator7.next()); - } - - seed(generator0, rand_seed, 0xfaaae481acb5874aULL + reinterpret_cast(output)); - seed(generator1, rand_seed, 0x3181a34f32887db6ULL + reinterpret_cast(output)); - seed(generator2, rand_seed, 0xb6970e4a91b66afdULL + reinterpret_cast(output)); - seed(generator3, rand_seed, 0xc16062649e83dc13ULL + reinterpret_cast(output)); - - if (end - output > 0) - { - unalignedStore(output, generator0.next()); - unalignedStore(output + 4, generator1.next()); - unalignedStore(output + 8, generator2.next()); - unalignedStore(output + 12, generator3.next()); - } -} - -typedef UInt64 UInt64x16 __attribute__ ((vector_size (128))); -typedef UInt64 UInt64x8 __attribute__ ((vector_size (64))); -typedef UInt64 UInt64x4 __attribute__ ((vector_size (32))); - -typedef UInt32 UInt32x16 __attribute__ ((vector_size (64))); -typedef UInt32 UInt32x8 __attribute__ ((vector_size (32))); -typedef UInt32 UInt32x4 __attribute__ ((vector_size (16))); - -template -struct DummyStruct; - -template <> -struct DummyStruct<4> -{ - using UInt64Type = UInt64x4; - using UInt32Type = UInt32x4; -}; -template <> -struct DummyStruct<8> -{ - using UInt64Type = UInt64x8; - using UInt32Type = UInt32x8; -}; -template <> -struct DummyStruct<16> -{ - using UInt64Type = UInt64x16; - using UInt32Type = UInt32x16; -}; - -template -using VecUInt64 = typename DummyStruct::UInt64Type; -template -using VecUInt32 = typename DummyStruct::UInt32Type; - -void RandImpl3::execute(char * output, size_t size) -{ - if (size == 0) - return; - - char * end = output + size; - - UInt64x4 generators = { - 0xfb4121280b2ab902ULL + reinterpret_cast(output), - 0x0121cf76df39c673ULL + reinterpret_cast(output), - 0x17ae86e3a19a602fULL + reinterpret_cast(output), - 0x8b6e16da7e06d622ULL + reinterpret_cast(output), - }; - - constexpr int bytes_per_write = sizeof(UInt32x4); - constexpr int safe_overwrite = 15; - - while ((end - output) + safe_overwrite >= bytes_per_write) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators >> 16, UInt32x4)); - output += bytes_per_write; - } -} - -void RandImpl4::execute(char * output, size_t size) -{ - if (size == 0) - return; - - char * end = output + size; - - UInt64 rand_seed = randomSeed(); - - UInt64x8 generators = { - calcSeed(rand_seed, 0xfb4121280b2ab902ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x0121cf76df39c673ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x17ae86e3a19a602fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x8b6e16da7e06d622ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0xfb4121f80b2ab902ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x0122cf767f39c633ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x14ae86e3a79a502fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x876316da7e06d622ULL + reinterpret_cast(output)), - }; - - constexpr int bytes_per_write = sizeof(UInt32x8); - constexpr int safe_overwrite = 15; - - while ((end - output) + safe_overwrite >= bytes_per_write) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators >> 16, UInt32x8)); - output += bytes_per_write; - } - - if ((end - output) > 0) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - UInt32x8 values = __builtin_convertvector(generators >> 16, UInt32x8); - for (int i = 0; (end - output) > 0; ++i) - { - unalignedStore(output, values[i]); - output += sizeof(UInt32); - } - } -} - -void RandImpl5::execute(char * output, size_t size) -{ - if (size == 0) - return; - - char * end = output + size; - - UInt64 rand_seed = randomSeed(); - - UInt64x16 generators = { - calcSeed(rand_seed, 0xfb4121280b2ab902ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x0121cf76df39c673ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x17ae86e3a19a602fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x8b6e16da7e06d622ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0xfb4121f80b2ab902ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x0122cf767f39c633ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x14ae86e3a79a502fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x876316da7e06d622ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0xfb4821280b2ab912ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x0126cf76df39c633ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x17a486e3a19a602fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x8b6216da7e08d622ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0xfb4101f80b5ab902ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x01226f767f34c633ULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x14ae86e3a75a502fULL + reinterpret_cast(output)), - calcSeed(rand_seed, 0x876e36da7e36d622ULL + reinterpret_cast(output)), - }; - - constexpr int bytes_per_write = sizeof(UInt32x16); - constexpr int safe_overwrite = 15; - - while ((end - output) + safe_overwrite >= bytes_per_write) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators >> 16, UInt32x16)); - output += bytes_per_write; - } - - if ((end - output) > 0) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - UInt32x16 values = __builtin_convertvector(generators >> 16, UInt32x16); - for (int i = 0; (end - output) > 0; ++i) - { - unalignedStore(output, values[i]); - output += sizeof(UInt32); - } - } -} - namespace { -constexpr std::array random_numbers = { - 0x0c8ff307dabc0c4cULL, - 0xf4bce78bf3821c1bULL, - 0x4eb628a1e189c21aULL, - 0x85ae000d253e0dbcULL, +// The array of random numbers from 'head -c8 /dev/urandom | xxd -p'. +// Can be used for creating seeds for random generators. +constexpr std::array random_numbers = { + 0x0c8ff307dabc0c4cULL, 0xf4bce78bf3821c1bULL, 0x4eb628a1e189c21aULL, 0x85ae000d253e0dbcULL, + 0xc98073e6480f8a10ULL, 0xb17e9b70a084d570ULL, 0x1361c752b768da8cULL, 0x3d915f60c06d144dULL, + 0xd5bc9b7aced79587ULL, 0x66c28000ba8a66cfULL, 0x0fb58da7a48820f5ULL, 0x540ee1b57aa861a1ULL, + 0x212f11936ef2db04ULL, 0xa3939cd900edcc58ULL, 0xc676c84420170102ULL, 0xcbdc824e8b4bf3edULL, - 0xc98073e6480f8a10ULL, - 0xb17e9b70a084d570ULL, - 0x1361c752b768da8cULL, - 0x3d915f60c06d144dULL, - - 0xd5bc9b7aced79587ULL, - 0x66c28000ba8a66cfULL, - 0x0fb58da7a48820f5ULL, - 0x540ee1b57aa861a1ULL, - - 0x212f11936ef2db04ULL, - 0xa3939cd900edcc58ULL, - 0xc676c84420170102ULL, - 0xcbdc824e8b4bf3edULL, + 0x8296f9d93cc94e3bULL, 0x78a7e826d62085b2ULL, 0xaa30620211fc6c69ULL, 0xbd38de52f0a93677ULL, + 0x19983de8d79dcc4eULL, 0x8afe883ef2199e6fULL, 0xb7160f7ed022b60aULL, 0x2ce173d373ddafd4ULL, + 0x15762761bb55b9acULL, 0x3e448fc94fdd28e7ULL, 0xa5121232adfbe70aULL, 0xb1e0f6d286112804ULL, + 0x6062e96de9554806ULL, 0xcc679b329c28882aULL, 0x5c6d29f45cbc060eULL, 0x1af1325a86ffb162ULL, }; }; -template -void RandVecImpl::execute(char * output, size_t size) +using namespace VectorExtension; + +template +void RandVecImpl::execute(char * output, size_t size) { - static_assert(VectorSize >= 4); - static_assert(VectorSize <= random_numbers.size()); + static_assert(VecSize >= 4); + static_assert(VecSize <= random_numbers.size()); + + using VecUInt64 = UInt64x; + using VecUInt32 = UInt32x; if (size == 0) return; @@ -331,80 +111,38 @@ void RandVecImpl::execute(char * output, size_t size) char * end = output + size; constexpr int safe_overwrite = 15; - constexpr int bytes_per_write = sizeof(VecUInt32); + constexpr int bytes_per_write = sizeof(VecUInt32); UInt64 rand_seed = randomSeed(); - VecUInt64 generators{}; - for (int i = 0; i < VectorSize; ++i) - generators[i] = calcSeed(rand_seed, random_numbers[VectorSize] + reinterpret_cast(output)); + UInt64 a = LinearCongruentialGenerator::a; + // TODO(dakovalkov): try to remove this. + /// Note: GCC likes to expand multiplication by a constant into shifts + additions. + /// In this case a few multiplications become tens of shifts and additions. That leads to a huge slow down. + /// To avoid it we pretend that 'a' is not a constant. Actually we hope that rand_seed is never 0. + if (rand_seed == 0) + a = LinearCongruentialGenerator::a + 2; + + constexpr UInt64 c = LinearCongruentialGenerator::c; + + VecUInt64 generators{}; + for (int i = 0; i < VecSize; ++i) + generators[i] = calcSeed(rand_seed, random_numbers[i] + reinterpret_cast(output)); while ((end - output) + safe_overwrite >= bytes_per_write) { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); - unalignedStore>(output, values); + generators = generators * a + c;; + VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); + unalignedStore(output, values); output += bytes_per_write; } - if ((end - output) > 0) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); - for (int i = 0; (end - output) > 0; ++i) - { - unalignedStore(output, values[i]); - output += sizeof(UInt32); - } - } -} - -template struct RandVecImpl<4>; -template struct RandVecImpl<8>; -template struct RandVecImpl<16>; - -template -void RandVecImpl2::execute(char * output, size_t size) -{ - static_assert(VectorSize >= 4); - - if (size == 0) - return; - - char * end = output + size; - - constexpr int safe_overwrite = 15; - constexpr int bytes_per_write = 2 * sizeof(VecUInt32); - - UInt64 rand_seed = randomSeed(); - VecUInt64 gens1{}, gens2{}; - for (int i = 0; i < VectorSize; ++i) - { - gens1[i] = calcSeed(rand_seed, i * 1123465ull * reinterpret_cast(output)); - gens2[i] = calcSeed(rand_seed, i * 6432453ull * reinterpret_cast(output)); - } - - while ((end - output) + safe_overwrite >= bytes_per_write) - { - gens1 *= LinearCongruentialGenerator::a; - gens1 += LinearCongruentialGenerator::c; - VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); - unalignedStore>(output, values1); - gens2 *= LinearCongruentialGenerator::a; - gens2 += LinearCongruentialGenerator::c; - VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); - unalignedStore>(output, values2); - output += bytes_per_write; - } - + // Process tail while ((end - output) > 0) { - gens1 *= LinearCongruentialGenerator::a; - gens1 += LinearCongruentialGenerator::c; - VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); - for (int i = 0; (end - output) > 0 && i < VectorSize; ++i) + generators = generators * a + c;; + VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); + for (int i = 0; i < VecSize && (end - output) > 0; ++i) { unalignedStore(output, values[i]); output += sizeof(UInt32); @@ -412,137 +150,60 @@ void RandVecImpl2::execute(char * output, size_t size) } } -template struct RandVecImpl2<4>; -template struct RandVecImpl2<8>; -template struct RandVecImpl2<16>; - -// template -// void RandVecImpl4::execute(char * output, size_t size) -// { -// static_assert(VectorSize >= 4); - -// if (size == 0) -// return; - -// char * end = output + size; - -// constexpr int safe_overwrite = 15; -// constexpr int bytes_per_write = 4 * sizeof(VecUInt32); - -// VecUInt64 gens1{}, gens2{}, gens3{}, gens4{}; -// for (int i = 0; i < VectorSize; ++i) -// { -// gens1[i] = calcSeed(i * 1123465ull * reinterpret_cast(output)); -// gens2[i] = calcSeed(i * 6432453ull * reinterpret_cast(output)); -// gens3[i] = calcSeed(i * 1346434ull * reinterpret_cast(output)); -// gens4[i] = calcSeed(i * 5344753ull * reinterpret_cast(output)); -// } - -// while ((end - output) + safe_overwrite >= bytes_per_write) -// { -// gens1 *= LinearCongruentialGenerator::a; -// gens1 += LinearCongruentialGenerator::c; -// VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); -// unalignedStore>(output, values1); -// gens2 *= LinearCongruentialGenerator::a; -// gens2 += LinearCongruentialGenerator::c; -// VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); -// unalignedStore>(output, values2); -// gens3 *= LinearCongruentialGenerator::a; -// gens3 += LinearCongruentialGenerator::c; -// VecUInt32 values3 = __builtin_convertvector(gens3 >> 16, VecUInt32); -// unalignedStore>(output, values3); -// gens4 *= LinearCongruentialGenerator::a; -// gens4 += LinearCongruentialGenerator::c; -// VecUInt32 values4 = __builtin_convertvector(gens4 >> 16, VecUInt32); -// unalignedStore>(output, values4); -// output += bytes_per_write; -// } - -// while ((end - output) > 0) -// { -// gens1 *= LinearCongruentialGenerator::a; -// gens1 += LinearCongruentialGenerator::c; -// VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); -// for (int i = 0; (end - output) > 0 && i < VectorSize; i += 4) -// { -// unalignedStore(output, values[i]); -// unalignedStore(output + 4, values[i + 1]); -// unalignedStore(output + 8, values[i + 2]); -// unalignedStore(output + 12, values[i + 3]); -// output += 16; -// } -// } -// } - -// template struct RandVecImpl2<4>; -// template struct RandVecImpl2<8>; -// template struct RandVecImpl2<16>; - -) //DECLARE_MULTITARGET_CODE - -DECLARE_AVX2_SPECIFIC_CODE( - -void RandImpl6::execute(char * output, size_t size) +template +void RandVecImpl2::execute(char * output, size_t size) { + static_assert(VecSize >= 4); + static_assert(2 * VecSize <= random_numbers.size()); + + using VecUInt64 = UInt64x; + using VecUInt32 = UInt32x; + if (size == 0) return; char * end = output + size; - UInt64x8 generators = { - 0x5f186ce5faee450bULL + reinterpret_cast(output), - 0x9adb2ca3c72ac2eeULL + reinterpret_cast(output), - 0x07acf8bfa2537705ULL + reinterpret_cast(output), - 0x692b1b533834db92ULL + reinterpret_cast(output), - 0x5148b84cdda30081ULL + reinterpret_cast(output), - 0xe17b8a75a301ad47ULL + reinterpret_cast(output), - 0x6d4a5d69ed2a5f56ULL + reinterpret_cast(output), - 0x114e23266201b333ULL + reinterpret_cast(output), - }; - - union { - UInt64x8 vec; - __m256i mm[2]; - } gens {generators}; - - constexpr int bytes_per_write = sizeof(UInt32x8); constexpr int safe_overwrite = 15; + constexpr int bytes_per_write = 2 * sizeof(VecUInt32); - const auto low_a = _mm256_set1_epi64x(0xDEECE66D); - // const auto high_a = _mm256_set1_epi64x(5); - const auto c = _mm256_set1_epi64x(11); + UInt64 rand_seed = randomSeed(); + + UInt64 a = LinearCongruentialGenerator::a; + // TODO(dakovalkov): try to remove this. + /// Note: GCC likes to expand multiplication by a constant into shifts + additions. + /// In this case a few multiplications become tens of shifts and additions. That leads to a huge slow down. + /// To avoid it we pretend that 'a' is not a constant. Actually we hope that rand_seed is never 0. + if (rand_seed == 0) + a = LinearCongruentialGenerator::a + 2; + + constexpr UInt64 c = LinearCongruentialGenerator::c; + + VecUInt64 gens1{}; + VecUInt64 gens2{}; + for (int i = 0; i < VecSize; ++i) + { + gens1[i] = calcSeed(rand_seed, random_numbers[i] + reinterpret_cast(output)); + gens2[i] = calcSeed(rand_seed, random_numbers[i + VecSize] + reinterpret_cast(output)); + } while ((end - output) + safe_overwrite >= bytes_per_write) { - { - auto gens_high = _mm256_srli_epi64(gens.mm[0], 32); - auto low_low_res = _mm256_mul_epu32(gens.mm[0], low_a); - auto high_low_res = _mm256_slli_epi64(_mm256_mul_epu32(gens_high, low_a), 32); - auto low_high_res = _mm256_slli_epi64(gens.mm[0], 32) + _mm256_slli_epi64(gens.mm[0], 34); - gens.mm[0] = _mm256_add_epi64(_mm256_add_epi64(low_low_res, high_low_res), - _mm256_add_epi64(low_high_res, c)); - } - { - auto gens_high = _mm256_srli_epi64(gens.mm[1], 32); - auto low_low_res = _mm256_mul_epu32(gens.mm[1], low_a); - auto high_low_res = _mm256_slli_epi64(_mm256_mul_epu32(gens_high, low_a), 32); - auto low_high_res = _mm256_slli_epi64(gens.mm[1], 32) + _mm256_slli_epi64(gens.mm[1], 34); - gens.mm[1] = _mm256_add_epi64(_mm256_add_epi64(low_low_res, high_low_res), - _mm256_add_epi64(low_high_res, c)); - } - // generators *= LinearCongruentialGenerator::a; - // generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(gens.vec >> 16, UInt32x8)); + gens1 = gens1 * a + c;; + VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); + unalignedStore(output, values1); + gens2 = gens2 * a + c;; + VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); + unalignedStore(output + sizeof(VecUInt32), values2); output += bytes_per_write; } - if ((end - output) > 0) + // Process tail + while ((end - output) > 0) { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - UInt32x8 values = __builtin_convertvector(generators >> 16, UInt32x8); - for (int i = 0; (end - output) > 0; ++i) + gens1 = gens1 * a + c;; + VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); + for (int i = 0; i < VecSize && (end - output) > 0; ++i) { unalignedStore(output, values[i]); output += sizeof(UInt32); @@ -550,6 +211,86 @@ void RandImpl6::execute(char * output, size_t size) } } +template +void RandVecImpl4::execute(char * output, size_t size) +{ + static_assert(VecSize >= 4); + static_assert(4 * VecSize <= random_numbers.size()); + + using VecUInt64 = UInt64x; + using VecUInt32 = UInt32x; + + if (size == 0) + return; + + char * end = output + size; + + constexpr int safe_overwrite = 15; + constexpr int bytes_per_write = 4 * sizeof(VecUInt32); + + UInt64 rand_seed = randomSeed(); + + UInt64 a = LinearCongruentialGenerator::a; + // TODO(dakovalkov): try to remove this. + /// Note: GCC likes to expand multiplication by a constant into shifts + additions. + /// In this case a few multiplications become tens of shifts and additions. That leads to a huge slow down. + /// To avoid it we pretend that 'a' is not a constant. Actually we hope that rand_seed is never 0. + if (rand_seed == 0) + a = LinearCongruentialGenerator::a + 2; + + constexpr UInt64 c = LinearCongruentialGenerator::c; + + VecUInt64 gens1{}; + VecUInt64 gens2{}; + VecUInt64 gens3{}; + VecUInt64 gens4{}; + for (int i = 0; i < VecSize; ++i) + { + gens1[i] = calcSeed(rand_seed, random_numbers[i] + reinterpret_cast(output)); + gens2[i] = calcSeed(rand_seed, random_numbers[i + VecSize] + reinterpret_cast(output)); + gens3[i] = calcSeed(rand_seed, random_numbers[i + 2 * VecSize] + reinterpret_cast(output)); + gens4[i] = calcSeed(rand_seed, random_numbers[i + 3 * VecSize] + reinterpret_cast(output)); + } + + while ((end - output) + safe_overwrite >= bytes_per_write) + { + gens1 = gens1 * a + c; + VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); + unalignedStore(output, values1); + gens2 = gens2 * a + c; + VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); + unalignedStore(output + sizeof(VecUInt32), values2); + gens3 = gens3 * a + c; + VecUInt32 values3 = __builtin_convertvector(gens3 >> 16, VecUInt32); + unalignedStore(output + 2 * sizeof(VecUInt32), values3); + gens4 = gens4 * a + c; + VecUInt32 values4 = __builtin_convertvector(gens4 >> 16, VecUInt32); + unalignedStore(output + 3 * sizeof(VecUInt32), values4); + output += bytes_per_write; + } + + // Process tail + while ((end - output) > 0) + { + gens1 = gens1 * a + c;; + VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); + for (int i = 0; i < VecSize && (end - output) > 0; ++i) + { + unalignedStore(output, values[i]); + output += sizeof(UInt32); + } + } +} + +) // DECLARE_MULTITARGET_CODE + +DECLARE_AVX2_SPECIFIC_CODE( + template struct RandVecImpl4<4>; ) // DECLARE_AVX2_SPECIFIC_CODE +DECLARE_AVX512F_SPECIFIC_CODE( + template struct RandVecImpl4<8>; +) // DECLARE_AVX512F_SPECIFIC_CODE + + } diff --git a/src/Functions/FunctionsRandom.h b/src/Functions/FunctionsRandom.h index 557e1fbe868..a82f199356e 100644 --- a/src/Functions/FunctionsRandom.h +++ b/src/Functions/FunctionsRandom.h @@ -43,49 +43,26 @@ struct RandImpl static void execute(char * output, size_t size); static String getImplementationTag() { return ToString(BuildArch); } }; - -struct RandImpl2 -{ - static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_v2"; } -}; - -struct RandImpl3 -{ - static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_v3"; } -}; - -struct RandImpl4 -{ - static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_v4"; } -}; - -struct RandImpl5 -{ - static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_v5"; } -}; - -template +// Isn't used now. +template struct RandVecImpl { - static void execute(char * outpu, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_vec_" + toString(VectorSize); } + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_vec_" + toString(VecSize); } }; - -template +// Isn't used now. +template struct RandVecImpl2 { - static void execute(char * outpu, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_vec2_" + toString(VectorSize); } + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_vec2_" + toString(VecSize); } }; -struct RandImpl6 +template +struct RandVecImpl4 { - static void execute(char * outpu, size_t size); - static String getImplementationTag() { return ToString(BuildArch) + "_v6"; } + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_vec4_" + toString(VecSize); } }; ) // DECLARE_MULTITARGET_CODE @@ -144,72 +121,16 @@ public: { selector.registerImplementation>(); - selector.registerImplementation>(); if constexpr (UseMultitargetCode) { - selector.registerImplementation>(); - selector.registerImplementation>(); + // vec impl 4 selector.registerImplementation>(); + FunctionRandomImpl, ToType, Name>>(); + selector.registerImplementation>(); + FunctionRandomImpl, ToType, Name>>(); - selector.registerImplementation>(); - - selector.registerImplementation>(); - selector.registerImplementation>(); - - selector.registerImplementation>(); - selector.registerImplementation>(); - - selector.registerImplementation>(); - selector.registerImplementation>(); - - // vec impl - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - // vec impl 2 - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - selector.registerImplementation, ToType, Name>>(); - selector.registerImplementation, ToType, Name>>(); - - selector.registerImplementation>(); } } diff --git a/src/Functions/VectorExtension.h b/src/Functions/VectorExtension.h new file mode 100644 index 00000000000..49a029bb0d9 --- /dev/null +++ b/src/Functions/VectorExtension.h @@ -0,0 +1,101 @@ +#pragma once + +#include +// Contains types declarations and wrappers for GCC vector extension. + +// TODO(dakovalkov): remove this workaround. +#if !defined(__clang__) +# pragma GCC diagnostic ignored "-Wvector-operation-performance" +#endif + +namespace DB::VectorExtension +{ + +typedef UInt64 UInt64x2 __attribute__ ((vector_size (sizeof(UInt64) * 2))); +typedef UInt64 UInt64x4 __attribute__ ((vector_size (sizeof(UInt64) * 4))); +typedef UInt64 UInt64x8 __attribute__ ((vector_size (sizeof(UInt64) * 8))); +typedef UInt64 UInt64x16 __attribute__ ((vector_size (sizeof(UInt64) * 16))); +typedef UInt64 UInt64x32 __attribute__ ((vector_size (sizeof(UInt64) * 32))); + +typedef UInt32 UInt32x2 __attribute__ ((vector_size (sizeof(UInt32) * 2))); +typedef UInt32 UInt32x4 __attribute__ ((vector_size (sizeof(UInt32) * 4))); +typedef UInt32 UInt32x8 __attribute__ ((vector_size (sizeof(UInt32) * 8))); +typedef UInt32 UInt32x16 __attribute__ ((vector_size (sizeof(UInt32) * 16))); +typedef UInt32 UInt32x32 __attribute__ ((vector_size (sizeof(UInt32) * 32))); +typedef UInt32 UInt32x64 __attribute__ ((vector_size (sizeof(UInt32) * 64))); + +typedef UInt16 UInt16x2 __attribute__ ((vector_size (sizeof(UInt16) * 2))); +typedef UInt16 UInt16x4 __attribute__ ((vector_size (sizeof(UInt16) * 4))); +typedef UInt16 UInt16x8 __attribute__ ((vector_size (sizeof(UInt16) * 8))); +typedef UInt16 UInt16x16 __attribute__ ((vector_size (sizeof(UInt16) * 16))); +typedef UInt16 UInt16x32 __attribute__ ((vector_size (sizeof(UInt16) * 32))); +typedef UInt16 UInt16x64 __attribute__ ((vector_size (sizeof(UInt16) * 64))); + +typedef UInt8 UInt8x2 __attribute__ ((vector_size (sizeof(UInt8) * 2))); +typedef UInt8 UInt8x4 __attribute__ ((vector_size (sizeof(UInt8) * 4))); +typedef UInt8 UInt8x8 __attribute__ ((vector_size (sizeof(UInt8) * 8))); +typedef UInt8 UInt8x16 __attribute__ ((vector_size (sizeof(UInt8) * 16))); +typedef UInt8 UInt8x32 __attribute__ ((vector_size (sizeof(UInt8) * 32))); +typedef UInt8 UInt8x64 __attribute__ ((vector_size (sizeof(UInt8) * 64))); + +namespace detail +{ + template + struct DummyStruct; + + template <> + struct DummyStruct<4> + { + using UInt8Type = UInt8x4; + using UInt16Type = UInt16x4; + using UInt32Type = UInt32x4; + using UInt64Type = UInt64x4; + }; + template <> + struct DummyStruct<8> + { + using UInt8Type = UInt8x8; + using UInt16Type = UInt16x8; + using UInt32Type = UInt32x8; + using UInt64Type = UInt64x8; + }; + template <> + struct DummyStruct<16> + { + using UInt8Type = UInt8x16; + using UInt16Type = UInt16x16; + using UInt32Type = UInt32x16; + using UInt64Type = UInt64x16; + }; + template <> + struct DummyStruct<32> + { + using UInt8Type = UInt8x32; + using UInt16Type = UInt16x32; + using UInt32Type = UInt32x32; + using UInt64Type = UInt64x32; + }; + +} + +// Same as above via template, e.g. UInt64x<8> +template +using UInt8x = typename detail::DummyStruct::UInt8Type; +template +using UInt16x = typename detail::DummyStruct::UInt16Type; +template +using UInt32x = typename detail::DummyStruct::UInt32Type; +template +using UInt64x = typename detail::DummyStruct::UInt64Type; + +/* Casts vectors of the same size. + * UInt32x4 x{}; + * UInt64x4 y = ConvertVector(x); + */ +// template +// inline To ConvertVector(From a) +// { +// return __builtin_convertvector(a, To); +// } + +}