diff --git a/src/Functions/FunctionsRandom.cpp b/src/Functions/FunctionsRandom.cpp index d0d25e56c60..496e0edcc5a 100644 --- a/src/Functions/FunctionsRandom.cpp +++ b/src/Functions/FunctionsRandom.cpp @@ -3,19 +3,18 @@ #include #include #include +#include namespace DB { -/* - // TODO(dakovalkov): remove this workaround. -#pragma GCC diagnostic ignored "-Wvector-operation-performance" +#if !defined(__clang__) +# pragma GCC diagnostic ignored "-Wvector-operation-performance" +#endif DECLARE_MULTITARGET_CODE( -*/ - namespace { /// NOTE Probably @@ -45,10 +44,16 @@ namespace } }; - void seed(LinearCongruentialGenerator & generator, intptr_t additional_seed) + UInt64 calcSeed(UInt64 rand_seed, UInt64 additional_seed) { - generator.seed(intHash64(randomSeed() ^ intHash64(additional_seed))); + return intHash64(rand_seed ^ intHash64(additional_seed)); } + + void seed(LinearCongruentialGenerator & generator, UInt64 rand_seed, intptr_t additional_seed) + { + generator.seed(calcSeed(rand_seed, additional_seed)); + } + } void RandImpl::execute(char * output, size_t size) @@ -58,10 +63,12 @@ void RandImpl::execute(char * output, size_t size) LinearCongruentialGenerator generator2; LinearCongruentialGenerator generator3; - seed(generator0, 0xfb4121280b2ab902ULL + reinterpret_cast(output)); - seed(generator1, 0x0121cf76df39c673ULL + reinterpret_cast(output)); - seed(generator2, 0x17ae86e3a19a602fULL + reinterpret_cast(output)); - seed(generator3, 0x8b6e16da7e06d622ULL + reinterpret_cast(output)); + UInt64 rand_seed = randomSeed(); + + seed(generator0, rand_seed, 0xfb4121280b2ab902ULL + reinterpret_cast(output)); + seed(generator1, rand_seed, 0x0121cf76df39c673ULL + reinterpret_cast(output)); + seed(generator2, rand_seed, 0x17ae86e3a19a602fULL + reinterpret_cast(output)); + seed(generator3, rand_seed, 0x8b6e16da7e06d622ULL + reinterpret_cast(output)); for (const char * end = output + size; output < end; output += 16) { @@ -73,55 +80,6 @@ void RandImpl::execute(char * output, 
size_t size) /// It is guaranteed (by PaddedPODArray) that we can overwrite up to 15 bytes after end. } -void RandImpl2::execute(char * output, size_t size) -{ - if (size == 0) - return; - - LinearCongruentialGenerator generator0; - LinearCongruentialGenerator generator1; - LinearCongruentialGenerator generator2; - LinearCongruentialGenerator generator3; - LinearCongruentialGenerator generator4; - LinearCongruentialGenerator generator5; - LinearCongruentialGenerator generator6; - LinearCongruentialGenerator generator7; - - seed(generator0, 0xfaaae481acb5874aULL + reinterpret_cast(output)); - seed(generator1, 0x3181a34f32887db6ULL + reinterpret_cast(output)); - seed(generator2, 0xb6970e4a91b66afdULL + reinterpret_cast(output)); - seed(generator3, 0xc16062649e83dc13ULL + reinterpret_cast(output)); - seed(generator4, 0xbb093972da5c8d92ULL + reinterpret_cast(output)); - seed(generator5, 0xc37dcc410dcfed31ULL + reinterpret_cast(output)); - seed(generator6, 0x45e1526b7a4367d5ULL + reinterpret_cast(output)); - seed(generator7, 0x99c2759203868a7fULL + reinterpret_cast(output)); - - const char * end = output + size; - - for (; (end - output + 15) <= 32; output += 32) - { - unalignedStore(output, generator0.next()); - unalignedStore(output + 4, generator1.next()); - unalignedStore(output + 8, generator2.next()); - unalignedStore(output + 12, generator3.next()); - unalignedStore(output + 16, generator4.next()); - unalignedStore(output + 20, generator5.next()); - unalignedStore(output + 24, generator6.next()); - unalignedStore(output + 28, generator7.next()); - } - - if (end - output > 0) - { - unalignedStore(output, generator0.next()); - unalignedStore(output + 4, generator1.next()); - unalignedStore(output + 8, generator2.next()); - unalignedStore(output + 12, generator3.next()); - output += 16; - } -} - -/* - typedef UInt64 UInt64x16 __attribute__ ((vector_size (128))); typedef UInt64 UInt64x8 __attribute__ ((vector_size (64))); typedef UInt64 UInt64x4 __attribute__ 
((vector_size (32))); @@ -130,58 +88,85 @@ typedef UInt32 UInt32x16 __attribute__ ((vector_size (64))); typedef UInt32 UInt32x8 __attribute__ ((vector_size (32))); typedef UInt32 UInt32x4 __attribute__ ((vector_size (16))); -void RandImpl3::execute(char * output, size_t size) +template +struct DummyStruct; + +template <> +struct DummyStruct<4> { + using UInt64Type = UInt64x4; + using UInt32Type = UInt32x4; +}; +template <> +struct DummyStruct<8> +{ + using UInt64Type = UInt64x8; + using UInt32Type = UInt32x8; +}; +template <> +struct DummyStruct<16> +{ + using UInt64Type = UInt64x16; + using UInt32Type = UInt32x16; +}; + +template +using VecUInt64 = typename DummyStruct::UInt64Type; +template +using VecUInt32 = typename DummyStruct::UInt32Type; + +namespace { + +constexpr std::array random_numbers = { + 0x0c8ff307dabc0c4cULL, + 0xf4bce78bf3821c1bULL, + 0x4eb628a1e189c21aULL, + 0x85ae000d253e0dbcULL, + + 0xc98073e6480f8a10ULL, + 0xb17e9b70a084d570ULL, + 0x1361c752b768da8cULL, + 0x3d915f60c06d144dULL, + + 0xd5bc9b7aced79587ULL, + 0x66c28000ba8a66cfULL, + 0x0fb58da7a48820f5ULL, + 0x540ee1b57aa861a1ULL, + + 0x212f11936ef2db04ULL, + 0xa3939cd900edcc58ULL, + 0xc676c84420170102ULL, + 0xcbdc824e8b4bf3edULL, +}; + +} + +template +void RandVecImpl::execute(char * output, size_t size) +{ + static_assert(VectorSize >= 4); + static_assert(VectorSize <= random_numbers.size()); /* guarantees random_numbers[i] below stays in bounds */ + if (size == 0) return; char * end = output + size; - UInt64x4 generators = { - 0xfb4121280b2ab902ULL + reinterpret_cast(output), - 0x0121cf76df39c673ULL + reinterpret_cast(output), - 0x17ae86e3a19a602fULL + reinterpret_cast(output), - 0x8b6e16da7e06d622ULL + reinterpret_cast(output), - }; - - constexpr int bytes_per_write = sizeof(UInt32x4); constexpr int safe_overwrite = 15; + constexpr int bytes_per_write = sizeof(VecUInt32); + + UInt64 rand_seed = randomSeed(); + + VecUInt64 generators{}; + for (int i = 0; i < VectorSize; ++i) /* lane i is seeded from its own table entry, so lanes differ */ + generators[i] = calcSeed(rand_seed, random_numbers[i] + 
reinterpret_cast(output)); while ((end - output) + safe_overwrite >= bytes_per_write) { generators *= LinearCongruentialGenerator::a; generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators, UInt32x4)); - output += bytes_per_write; - } -} - -void RandImpl4::execute(char * output, size_t size) -{ - if (size == 0) - return; - - char * end = output + size; - - UInt64x8 generators = { - 0x5f186ce5faee450bULL + reinterpret_cast(output), - 0x9adb2ca3c72ac2eeULL + reinterpret_cast(output), - 0x07acf8bfa2537705ULL + reinterpret_cast(output), - 0x692b1b533834db92ULL + reinterpret_cast(output), - 0x5148b84cdda30081ULL + reinterpret_cast(output), - 0xe17b8a75a301ad47ULL + reinterpret_cast(output), - 0x6d4a5d69ed2a5f56ULL + reinterpret_cast(output), - 0x114e23266201b333ULL + reinterpret_cast(output), - }; - - constexpr int bytes_per_write = sizeof(UInt32x8); - constexpr int safe_overwrite = 15; - - while ((end - output) + safe_overwrite >= bytes_per_write) - { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators, UInt32x8)); + VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); + unalignedStore>(output, values); output += bytes_per_write; } @@ -189,7 +174,7 @@ void RandImpl4::execute(char * output, size_t size) { generators *= LinearCongruentialGenerator::a; generators += LinearCongruentialGenerator::c; - UInt32x8 values = __builtin_convertvector(generators, UInt32x8); + VecUInt32 values = __builtin_convertvector(generators >> 16, VecUInt32); for (int i = 0; (end - output) > 0; ++i) { unalignedStore(output, values[i]); @@ -198,49 +183,50 @@ void RandImpl4::execute(char * output, size_t size) } } -void RandImpl5::execute(char * output, size_t size) +template struct RandVecImpl<4>; +template struct RandVecImpl<8>; +template struct RandVecImpl<16>; + +template +void RandVecImpl2::execute(char * output, size_t 
size) { + static_assert(VectorSize >= 4); + if (size == 0) return; char * end = output + size; - UInt64x16 generators = { - 0xfb4121280b2ab902ULL + reinterpret_cast(output), - 0x0121cf76df39c673ULL + reinterpret_cast(output), - 0x17ae86e3a19a602fULL + reinterpret_cast(output), - 0x8b6e16da7e06d622ULL + reinterpret_cast(output), - 0xfb4121f80b2ab902ULL + reinterpret_cast(output), - 0x0122cf767f39c633ULL + reinterpret_cast(output), - 0x14ae86e3a79a502fULL + reinterpret_cast(output), - 0x876316da7e06d622ULL + reinterpret_cast(output), - 0xfb4821280b2ab912ULL + reinterpret_cast(output), - 0x0126cf76df39c633ULL + reinterpret_cast(output), - 0x17a486e3a19a602fULL + reinterpret_cast(output), - 0x8b6216da7e08d622ULL + reinterpret_cast(output), - 0xfb4101f80b5ab902ULL + reinterpret_cast(output), - 0x01226f767f34c633ULL + reinterpret_cast(output), - 0x14ae86e3a75a502fULL + reinterpret_cast(output), - 0x876e36da7e36d622ULL + reinterpret_cast(output), - }; - - constexpr int bytes_per_write = sizeof(UInt32x16); constexpr int safe_overwrite = 15; + constexpr int bytes_per_write = 2 * sizeof(VecUInt32); + + UInt64 rand_seed = randomSeed(); + VecUInt64 gens1{}, gens2{}; + for (int i = 0; i < VectorSize; ++i) + { /* (i + 1), not i: with i == 0 both products are 0 and lane 0 of gens1 and gens2 would get the same seed */ + gens1[i] = calcSeed(rand_seed, (i + 1) * 1123465ull * reinterpret_cast(output)); + gens2[i] = calcSeed(rand_seed, (i + 1) * 6432453ull * reinterpret_cast(output)); + } while ((end - output) + safe_overwrite >= bytes_per_write) { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - unalignedStore(output, __builtin_convertvector(generators, UInt32x16)); + gens1 *= LinearCongruentialGenerator::a; + gens1 += LinearCongruentialGenerator::c; + VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); + unalignedStore>(output, values1); + gens2 *= LinearCongruentialGenerator::a; + gens2 += LinearCongruentialGenerator::c; + VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); + unalignedStore>(output + bytes_per_write / 2, values2); /* second half of the stride; storing at output again would overwrite values1 */ output 
+= bytes_per_write; } - - if ((end - output) > 0) + + while ((end - output) > 0) { - generators *= LinearCongruentialGenerator::a; - generators += LinearCongruentialGenerator::c; - UInt32x16 values = __builtin_convertvector(generators, UInt32x16); - for (int i = 0; (end - output) > 0; ++i) + gens1 *= LinearCongruentialGenerator::a; + gens1 += LinearCongruentialGenerator::c; + VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); + for (int i = 0; (end - output) > 0 && i < VectorSize; ++i) { unalignedStore(output, values[i]); output += sizeof(UInt32); @@ -248,8 +234,73 @@ void RandImpl5::execute(char * output, size_t size) } } +template struct RandVecImpl2<4>; +template struct RandVecImpl2<8>; +template struct RandVecImpl2<16>; + +// template +// void RandVecImpl4::execute(char * output, size_t size) +// { +// static_assert(VectorSize >= 4); + +// if (size == 0) +// return; + +// char * end = output + size; + +// constexpr int safe_overwrite = 15; +// constexpr int bytes_per_write = 4 * sizeof(VecUInt32); + +// VecUInt64 gens1{}, gens2{}, gens3{}, gens4{}; +// for (int i = 0; i < VectorSize; ++i) +// { +// gens1[i] = calcSeed(i * 1123465ull * reinterpret_cast(output)); +// gens2[i] = calcSeed(i * 6432453ull * reinterpret_cast(output)); +// gens3[i] = calcSeed(i * 1346434ull * reinterpret_cast(output)); +// gens4[i] = calcSeed(i * 5344753ull * reinterpret_cast(output)); +// } + +// while ((end - output) + safe_overwrite >= bytes_per_write) +// { +// gens1 *= LinearCongruentialGenerator::a; +// gens1 += LinearCongruentialGenerator::c; +// VecUInt32 values1 = __builtin_convertvector(gens1 >> 16, VecUInt32); +// unalignedStore>(output, values1); +// gens2 *= LinearCongruentialGenerator::a; +// gens2 += LinearCongruentialGenerator::c; +// VecUInt32 values2 = __builtin_convertvector(gens2 >> 16, VecUInt32); +// unalignedStore>(output, values2); +// gens3 *= LinearCongruentialGenerator::a; +// gens3 += LinearCongruentialGenerator::c; +// VecUInt32 values3 = 
__builtin_convertvector(gens3 >> 16, VecUInt32); +// unalignedStore>(output, values3); +// gens4 *= LinearCongruentialGenerator::a; +// gens4 += LinearCongruentialGenerator::c; +// VecUInt32 values4 = __builtin_convertvector(gens4 >> 16, VecUInt32); +// unalignedStore>(output, values4); +// output += bytes_per_write; +// } + +// while ((end - output) > 0) +// { +// gens1 *= LinearCongruentialGenerator::a; +// gens1 += LinearCongruentialGenerator::c; +// VecUInt32 values = __builtin_convertvector(gens1 >> 16, VecUInt32); +// for (int i = 0; (end - output) > 0 && i < VectorSize; i += 4) +// { +// unalignedStore(output, values[i]); +// unalignedStore(output + 4, values[i + 1]); +// unalignedStore(output + 8, values[i + 2]); +// unalignedStore(output + 12, values[i + 3]); +// output += 16; +// } +// } +// } + +// template struct RandVecImpl2<4>; +// template struct RandVecImpl2<8>; +// template struct RandVecImpl2<16>; + ) //DECLARE_MULTITARGET_CODE -*/ - } diff --git a/src/Functions/FunctionsRandom.h b/src/Functions/FunctionsRandom.h index 9a06d8df7a3..557e1fbe868 100644 --- a/src/Functions/FunctionsRandom.h +++ b/src/Functions/FunctionsRandom.h @@ -36,26 +36,20 @@ namespace ErrorCodes * This means that the timer must be of sufficient resolution to give different values to each block. 
*/ -/* - DECLARE_MULTITARGET_CODE( -*/ - struct RandImpl { static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(TargetArch::Default); } + static String getImplementationTag() { return ToString(BuildArch); } }; struct RandImpl2 { static void execute(char * output, size_t size); - static String getImplementationTag() { return ToString(TargetArch::Default) + "_v2"; } + static String getImplementationTag() { return ToString(BuildArch) + "_v2"; } }; -/* - struct RandImpl3 { static void execute(char * output, size_t size); @@ -74,9 +68,27 @@ struct RandImpl5 static String getImplementationTag() { return ToString(BuildArch) + "_v5"; } }; -) // DECLARE_MULTITARGET_CODE +template +struct RandVecImpl +{ + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_vec_" + toString(VectorSize); } +}; -*/ +template +struct RandVecImpl2 +{ + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_vec2_" + toString(VectorSize); } +}; + +struct RandImpl6 +{ + static void execute(char * output, size_t size); + static String getImplementationTag() { return ToString(BuildArch) + "_v6"; } +}; + +) // DECLARE_MULTITARGET_CODE template class FunctionRandomImpl : public IFunction @@ -125,45 +137,80 @@ public: }; template -class FunctionRandom : public FunctionRandomImpl +class FunctionRandom : public FunctionRandomImpl { public: FunctionRandom(const Context & context) : selector(context) { - // selector.registerImplementation>(); selector.registerImplementation>(); + FunctionRandomImpl>(); + selector.registerImplementation>(); - // if constexpr (UseMultitargetCode) - // { - // selector.registerImplementation>(); - // selector.registerImplementation>(); - // selector.registerImplementation>(); - // selector.registerImplementation>(); + if constexpr (UseMultitargetCode) + { + selector.registerImplementation>(); + 
selector.registerImplementation>(); + selector.registerImplementation>(); + selector.registerImplementation>(); - // selector.registerImplementation>(); + selector.registerImplementation>(); - // selector.registerImplementation>(); - // selector.registerImplementation>(); + selector.registerImplementation>(); + selector.registerImplementation>(); - // selector.registerImplementation>(); - // selector.registerImplementation>(); + selector.registerImplementation>(); + selector.registerImplementation>(); - // selector.registerImplementation>(); - // selector.registerImplementation>(); - // } + selector.registerImplementation>(); + selector.registerImplementation>(); + + // vec impl + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + // vec impl 2 + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + selector.registerImplementation, ToType, Name>>(); + selector.registerImplementation, ToType, Name>>(); + + selector.registerImplementation>(); + } } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override diff --git a/src/Functions/generateUUIDv4.cpp b/src/Functions/generateUUIDv4.cpp index 4db3bd4c73d..d543226ba5c 100644 --- a/src/Functions/generateUUIDv4.cpp +++ b/src/Functions/generateUUIDv4.cpp @@ -33,7 +33,7 @@ public: size_t size = input_rows_count; vec_to.resize(size); // TODO(dakovalkov): rewrite this workaround - RandImpl::execute(reinterpret_cast(vec_to.data()), vec_to.size() * sizeof(UInt128)); + TargetSpecific::Default::RandImpl::execute(reinterpret_cast(vec_to.data()), 
vec_to.size() * sizeof(UInt128)); for (UInt128 & uuid: vec_to) { diff --git a/src/Functions/randConstant.cpp b/src/Functions/randConstant.cpp index 163f943d206..3eba5abf10d 100644 --- a/src/Functions/randConstant.cpp +++ b/src/Functions/randConstant.cpp @@ -100,7 +100,7 @@ public: typename ColumnVector::Container vec_to(1); // TODO(dakovalkov): Rewrite this workaround - RandImpl::execute(reinterpret_cast(vec_to.data()), sizeof(ToType)); + TargetSpecific::Default::RandImpl::execute(reinterpret_cast(vec_to.data()), sizeof(ToType)); ToType value = vec_to[0]; return std::make_unique>(value, argument_types, return_type);