Merge a446ff5524 into b4504f20bf

2024-09-19 16:20:50 +00:00 · 2024-09-19 21:32:51 +08:00 · 2024-09-19 21:32:51 +08:00 · 1a73bd79a4
commit 1a73bd79a4
parent b4504f20bf a446ff5524
10 changed files with 239 additions and 260 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -314,12 +314,12 @@ set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffp-contract=off")
 set (DEBUG_INFO_FLAGS "-g")

 # Disable omit frame pointer compiler optimization using -fno-omit-frame-pointer
-option(DISABLE_OMIT_FRAME_POINTER "Disable omit frame pointer compiler optimization" OFF)
+option(DISABLE_OMIT_FRAME_POINTER "Disable omit frame pointer compiler optimization" ON)

 if (DISABLE_OMIT_FRAME_POINTER)
-    set (CMAKE_CXX_FLAGS_ADD "${CMAKE_CXX_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
-    set (CMAKE_C_FLAGS_ADD "${CMAKE_C_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
-    set (CMAKE_ASM_FLAGS_ADD "${CMAKE_ASM_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
+    set (CMAKE_CXX_FLAGS_ADD "${CMAKE_CXX_FLAGS_ADD} -fno-omit-frame-pointer")
+    set (CMAKE_C_FLAGS_ADD "${CMAKE_C_FLAGS_ADD} -fno-omit-frame-pointer")
+    set (CMAKE_ASM_FLAGS_ADD "${CMAKE_ASM_FLAGS_ADD} -fno-omit-frame-pointer")
 endif()

 # Before you start hating your debugger because it refuses to show variables ('<optimized out>'), try building with -DDEBUG_O_LEVEL="0"
--- a/src/Functions/FunctionsHashing.h
+++ b/src/Functions/FunctionsHashing.h
@ -739,8 +739,6 @@ struct ImplXXH3
    static constexpr bool use_int_hash_for_pods = false;
 };

-DECLARE_MULTITARGET_CODE(
-
 template <typename Impl, typename Name>
 class FunctionIntHash : public IFunction
 {
@ -832,44 +830,11 @@ public:
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
                arguments[0].type->getName(), getName());
    }
+
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionIntHash>(); }
 };

-) // DECLARE_MULTITARGET_CODE
-
-template <typename Impl, typename Name>
-class FunctionIntHash : public TargetSpecific::Default::FunctionIntHash<Impl, Name>
-{
-public:
-    explicit FunctionIntHash(ContextPtr context) : selector(context)
-    {
-        selector.registerImplementation<TargetArch::Default,
-            TargetSpecific::Default::FunctionIntHash<Impl, Name>>();
-
-    #if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2,
-            TargetSpecific::AVX2::FunctionIntHash<Impl, Name>>();
-        selector.registerImplementation<TargetArch::AVX512F,
-            TargetSpecific::AVX512F::FunctionIntHash<Impl, Name>>();
-    #endif
-    }
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionIntHash>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
-};
-
-DECLARE_MULTITARGET_CODE(
-
-template <typename Impl, bool Keyed, typename KeyType, typename KeyColumnsType>
+template <typename Impl, bool Keyed = false, typename KeyType = char, typename KeyColumnsType = char>
 class FunctionAnyHash : public IFunction
 {
 public:
@ -1396,38 +1361,8 @@ public:
        else
            return Impl::combineHashes(h1, h2);
    }
-};

-) // DECLARE_MULTITARGET_CODE
-
-template <typename Impl, bool Keyed = false, typename KeyType = char, typename KeyColumnsType = char>
-class FunctionAnyHash : public TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>
-{
-public:
-    explicit FunctionAnyHash(ContextPtr context) : selector(context)
-    {
-        selector
-            .registerImplementation<TargetArch::Default, TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
-
-#if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2, TargetSpecific::AVX2::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
-        selector
-            .registerImplementation<TargetArch::AVX512F, TargetSpecific::AVX512F::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
-#endif
-    }
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionAnyHash>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionAnyHash>(); }
 };


--- a/src/Functions/FunctionsRandom.cpp
+++ b/src/Functions/FunctionsRandom.cpp
@ -4,8 +4,11 @@
 #include <Common/HashTable/Hash.h>
 #include <Common/randomSeed.h>
 #include <base/unaligned.h>
+
 #if USE_MULTITARGET_CODE
-#  include <x86intrin.h>
+#    include <Common/TargetSpecific.h>
+
+#    include <immintrin.h>
 #endif


@ -64,59 +67,62 @@ namespace
        0x15762761bb55b9acULL, 0x3e448fc94fdd28e7ULL, 0xa5121232adfbe70aULL, 0xb1e0f6d286112804ULL,
        0x6062e96de9554806ULL, 0xcc679b329c28882aULL, 0x5c6d29f45cbc060eULL, 0x1af1325a86ffb162ULL,
    };
-}

-DECLARE_DEFAULT_CODE(
-
-void RandImpl::execute(char * output, size_t size)
-{
-    LinearCongruentialGenerator generator0;
-    LinearCongruentialGenerator generator1;
-    LinearCongruentialGenerator generator2;
-    LinearCongruentialGenerator generator3;
-
-    UInt64 rand_seed = randomSeed();
-
-    seed(generator0, rand_seed, random_numbers[0] + reinterpret_cast<intptr_t>(output));
-    seed(generator1, rand_seed, random_numbers[1] + reinterpret_cast<intptr_t>(output));
-    seed(generator2, rand_seed, random_numbers[2] + reinterpret_cast<intptr_t>(output));
-    seed(generator3, rand_seed, random_numbers[3] + reinterpret_cast<intptr_t>(output));
-
-    for (const char * end = output + size; output < end; output += 16)
+    void randImpl(char * output, size_t size)
    {
-        unalignedStore<UInt32>(output, generator0.next());
-        unalignedStore<UInt32>(output + 4, generator1.next());
-        unalignedStore<UInt32>(output + 8, generator2.next());
-        unalignedStore<UInt32>(output + 12, generator3.next());
+        LinearCongruentialGenerator generator0;
+        LinearCongruentialGenerator generator1;
+        LinearCongruentialGenerator generator2;
+        LinearCongruentialGenerator generator3;
+
+        UInt64 rand_seed = randomSeed();
+
+        seed(generator0, rand_seed, random_numbers[0] + reinterpret_cast<intptr_t>(output));
+        seed(generator1, rand_seed, random_numbers[1] + reinterpret_cast<intptr_t>(output));
+        seed(generator2, rand_seed, random_numbers[2] + reinterpret_cast<intptr_t>(output));
+        seed(generator3, rand_seed, random_numbers[3] + reinterpret_cast<intptr_t>(output));
+
+        for (const char * end = output + size; output < end; output += 16)
+        {
+            unalignedStore<UInt32>(output, generator0.next());
+            unalignedStore<UInt32>(output + 4, generator1.next());
+            unalignedStore<UInt32>(output + 8, generator2.next());
+            unalignedStore<UInt32>(output + 12, generator3.next());
+        }
+        /// It is guaranteed (by PaddedPODArray) that we can overwrite up to 15 bytes after end.
    }
-    /// It is guaranteed (by PaddedPODArray) that we can overwrite up to 15 bytes after end.
 }

-) // DECLARE_DEFAULT_CODE
-
-DECLARE_AVX2_SPECIFIC_CODE(
+#if USE_MULTITARGET_CODE

 using namespace VectorExtension;

 /* Takes 2 vectors with LinearCongruentialGenerator states and combines them into vector with random values.
 * From every rand-state we use only bits 15...47 to generate random vector.
 */
-inline UInt64x4 combineValues(UInt64x4 a, UInt64x4 b)
+AVX2_FUNCTION_SPECIFIC_ATTRIBUTE ALWAYS_INLINE inline UInt64x4 combineValuesAVX2(UInt64x4 & a, UInt64x4 & b)
 {
    auto xa = reinterpret_cast<__m256i>(a);
    auto xb = reinterpret_cast<__m256i>(b);
-    /// Every state is 8-byte value and we need to use only 4 from the middle.
-    /// Swap the low half and the high half of every state to move these bytes from the middle to sides.
-    /// xa = xa[1, 0, 3, 2, 5, 4, 7, 6]
+
+    /// 2 128-bit lanes
+    /// Each lane consists of 4 32-bit words (two 64-bit generator states)
+    /// Every 64-bit state keeps its useful bytes in the middle, so we swap its two 32-bit halves to move them to the sides
+    /// Mask: 0xb1 => 0b10110001 => Order: 2, 3, 0, 1
+    /// In 16-bit word indices (first lane shown): xa = a[2, 3, 0, 1, 6, 7, 4, 5]
    xa = _mm256_shuffle_epi32(xa, 0xb1);
-    /// Now every 8-byte value in xa is xx....xx and every value in xb is ..xxxx.. where x is random byte we want to use.
+
+    /// Now every 8-byte value in xa is xx....xx and every 8-byte value in xb is ..xxxx.. where x is a random byte we want to use.
+    /// Now each lane consists of 8 16-bit words
    /// Just blend them to get the result vector.
-    /// result = xa[0],xb[1,2],xa[3,4],xb[5,6],xa[7,8],xb[9,10],xa[11,12],xb[13,14],xa[15]
+    /// Mask (least significant 8 bits): 0x66 => 0b01100110 => a_b_b_a_a_b_b_a (x2)
+    /// result = xa[0],xb[1,2],xa[3,4],xb[5,6],xa[7] - xa[8],xb[9,10],xa[11,12],xb[13,14],xa[15]
+    /// Final: a[2], b[1], b[2], a[1], a[6], b[5], b[6], a[5] - a[10], b[9], b[10], a[9], a[14], b[13], b[14], a[13]
    __m256i result = _mm256_blend_epi16(xa, xb, 0x66);
    return reinterpret_cast<UInt64x4>(result);
 }

-void RandImpl::execute(char * output, size_t size)
+AVX2_FUNCTION_SPECIFIC_ATTRIBUTE void NO_INLINE RandImpl::executeAVX2(char * output, size_t size)
 {
    if (size == 0)
        return;
@ -130,13 +136,6 @@ void RandImpl::execute(char * output, size_t size)
    UInt64 rand_seed = randomSeed();

    UInt64 a = LinearCongruentialGenerator::a;
-    // TODO(dakovalkov): try to remove this.
-    /// Note: GCC likes to expand multiplication by a constant into shifts + additions.
-    /// In this case a few multiplications become tens of shifts and additions. That leads to a huge slow down.
-    /// To avoid it we pretend that 'a' is not a constant. Actually we hope that rand_seed is never 0.
-    if (rand_seed == 0)
-        a = LinearCongruentialGenerator::a + 2;
-
    constexpr UInt64 c = LinearCongruentialGenerator::c;

    UInt64x4 gens1{};
@ -156,16 +155,16 @@ void RandImpl::execute(char * output, size_t size)
    {
        gens1 = gens1 * a + c;
        gens2 = gens2 * a + c;
-        unalignedStore<UInt64x4>(output, combineValues(gens1, gens2));
+        unalignedStore<UInt64x4>(output, combineValuesAVX2(gens1, gens2));
        gens3 = gens3 * a + c;
        gens4 = gens4 * a + c;
-        unalignedStore<UInt64x4>(output + sizeof(UInt64x4), combineValues(gens3, gens4));
+        unalignedStore<UInt64x4>(output + sizeof(UInt64x4), combineValuesAVX2(gens3, gens4));
        gens1 = gens1 * a + c;
        gens2 = gens2 * a + c;
-        unalignedStore<UInt64x4>(output + 2 * sizeof(UInt64x4), combineValues(gens1, gens2));
+        unalignedStore<UInt64x4>(output + 2 * sizeof(UInt64x4), combineValuesAVX2(gens1, gens2));
        gens3 = gens3 * a + c;
        gens4 = gens4 * a + c;
-        unalignedStore<UInt64x4>(output + 3 * sizeof(UInt64x4), combineValues(gens3, gens4));
+        unalignedStore<UInt64x4>(output + 3 * sizeof(UInt64x4), combineValuesAVX2(gens3, gens4));
        output += bytes_per_write;
    }

@ -174,7 +173,7 @@ void RandImpl::execute(char * output, size_t size)
    {
        gens1 = gens1 * a + c;
        gens2 = gens2 * a + c;
-        UInt64x4 values = combineValues(gens1, gens2);
+        UInt64x4 values = combineValuesAVX2(gens1, gens2);
        for (int i = 0; i < vec_size && (end - output) > 0; ++i)
        {
            unalignedStore<UInt64>(output, values[i]);
@ -183,6 +182,108 @@ void RandImpl::execute(char * output, size_t size)
    }
 }

-) // DECLARE_AVX2_SPECIFIC_CODE

+/* Takes 2 vectors with LinearCongruentialGenerator states and combines them into vector with random values.
+ * From every rand-state we use only bits 15...47 to generate random vector.
+ */
+AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE ALWAYS_INLINE inline UInt64x8 combineValuesAVX512BW(UInt64x8 & a, UInt64x8 & b)
+{
+    auto xa = reinterpret_cast<__m512i>(a);
+    auto xb = reinterpret_cast<__m512i>(b);
+
+    /// 4 128-bit lanes
+    /// Each lane consists of 4 32-bit words (two 64-bit generator states)
+    /// Every 64-bit state keeps its useful bytes in the middle, so we swap its two 32-bit halves to move them to the sides
+    /// Mask: 0xb1 => 0b10110001 => Order: 2, 3, 0, 1
+    /// In 16-bit word indices (first two lanes shown): xa = a[2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13]
+    xa = _mm512_shuffle_epi32(xa, 0xb1); // 0b10110001 => 2_3_0_1 (128 bits x 4 times)
+
+    /// Now every 8-byte value in xa is xx....xx and every 8-byte value in xb is ..xxxx.. where x is a random byte we want to use.
+    /// Viewed as 16-bit words, the whole 512-bit vector consists of 32 of them (8 per 128-bit lane)
+    /// Just blend them to get the result vector.
+    /// Mask (all 32 bits are used): 0x66666666 => 0b01100110011001100110011001100110
+    __m512i result = _mm512_mask_blend_epi16(0x66666666, xa, xb);
+    return reinterpret_cast<UInt64x8>(result);
+}
+
+AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE void NO_INLINE RandImpl::executeAVX512BW(char * output, size_t size)
+{
+    if (size == 0)
+        return;
+
+    char * end = output + size;
+
+    constexpr int vec_size = 8;
+    constexpr int safe_overwrite = PADDING_FOR_SIMD - 1;
+    constexpr int bytes_per_write = 4 * sizeof(UInt64x8);
+
+    UInt64 rand_seed = randomSeed();
+
+    UInt64 a = LinearCongruentialGenerator::a;
+    constexpr UInt64 c = LinearCongruentialGenerator::c;
+
+    UInt64x8 gens1{};
+    UInt64x8 gens2{};
+    UInt64x8 gens3{};
+    UInt64x8 gens4{};
+
+    for (int i = 0; i < vec_size; ++i)
+    {
+        gens1[i] = calcSeed(rand_seed, random_numbers[i] + reinterpret_cast<intptr_t>(output));
+        gens2[i] = calcSeed(rand_seed, random_numbers[i + vec_size] + reinterpret_cast<intptr_t>(output));
+        gens3[i] = calcSeed(rand_seed, random_numbers[i + 2 * vec_size] + reinterpret_cast<intptr_t>(output));
+        gens4[i] = calcSeed(rand_seed, random_numbers[i + 3 * vec_size] + reinterpret_cast<intptr_t>(output));
+    }
+
+    while ((end - output) + safe_overwrite >= bytes_per_write)
+    {
+        gens1 = gens1 * a + c;
+        gens2 = gens2 * a + c;
+        unalignedStore<UInt64x8>(output, combineValuesAVX512BW(gens1, gens2));
+        gens3 = gens3 * a + c;
+        gens4 = gens4 * a + c;
+        unalignedStore<UInt64x8>(output + sizeof(UInt64x8), combineValuesAVX512BW(gens3, gens4));
+        gens1 = gens1 * a + c;
+        gens2 = gens2 * a + c;
+        unalignedStore<UInt64x8>(output + 2 * sizeof(UInt64x8), combineValuesAVX512BW(gens1, gens2));
+        gens3 = gens3 * a + c;
+        gens4 = gens4 * a + c;
+        unalignedStore<UInt64x8>(output + 3 * sizeof(UInt64x8), combineValuesAVX512BW(gens3, gens4));
+        output += bytes_per_write;
+    }
+
+    // Process tail
+    while ((end - output) > 0)
+    {
+        gens1 = gens1 * a + c;
+        gens2 = gens2 * a + c;
+        UInt64x8 values = combineValuesAVX512BW(gens1, gens2);
+        for (int i = 0; i < vec_size && (end - output) > 0; ++i)
+        {
+            unalignedStore<UInt64>(output, values[i]);
+            output += sizeof(UInt64);
+        }
+    }
+}
+
+#endif
+
+void RandImpl::execute(char * output, size_t size)
+{
+#if USE_MULTITARGET_CODE
+    if (isArchSupported(TargetArch::AVX512BW))
+    {
+        executeAVX512BW(output, size);
+        return;
+    }
+
+    if (isArchSupported(TargetArch::AVX2))
+    {
+        executeAVX2(output, size);
+        return;
+    }
+#endif
+
+    randImpl(output, size);
+}
 }
--- a/src/Functions/FunctionsRandom.h
+++ b/src/Functions/FunctionsRandom.h
@ -36,18 +36,20 @@ namespace ErrorCodes
  * This means that the timer must be of sufficient resolution to give different values to each columns.
  */

-DECLARE_MULTITARGET_CODE(
-
 struct RandImpl
 {
    /// Fill memory with random data. The memory region must be 15-bytes padded.
    static void execute(char * output, size_t size);
+
+#if USE_MULTITARGET_CODE
+    /// Assumes isArchSupported has been verified before calling
+    static void executeAVX2(char * output, size_t size);
+    static void executeAVX512BW(char * output, size_t size);
+#endif
 };

-) // DECLARE_MULTITARGET_CODE
-
-template <typename RandImpl, typename ToType, typename Name>
-class FunctionRandomImpl : public IFunction
+template <typename ToType, typename Name>
+class FunctionRandom : public IFunction
 {
 public:
    static constexpr auto name = Name::name;
@ -85,35 +87,8 @@ public:

        return col_to;
    }
-};

-template <typename ToType, typename Name>
-class FunctionRandom : public FunctionRandomImpl<TargetSpecific::Default::RandImpl, ToType, Name>
-{
-public:
-    explicit FunctionRandom(ContextPtr context) : selector(context)
-    {
-        selector.registerImplementation<TargetArch::Default,
-            FunctionRandomImpl<TargetSpecific::Default::RandImpl, ToType, Name>>();
-
-    #if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2,
-            FunctionRandomImpl<TargetSpecific::AVX2::RandImpl, ToType, Name>>();
-    #endif
-    }
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionRandom<ToType, Name>>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionRandom<ToType, Name>>(); }
 };

 }
--- a/src/Functions/canonicalRand.cpp
+++ b/src/Functions/canonicalRand.cpp
@ -37,10 +37,24 @@ struct NameCanonicalRand
    static constexpr auto name = "randCanonical";
 };

-class FunctionCanonicalRand : public FunctionRandomImpl<CanonicalRandImpl, Float64, NameCanonicalRand>
+class FunctionCanonicalRand : public FunctionRandom<Float64, NameCanonicalRand>
 {
+    using ToType = Float64;
+
 public:
    static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared<FunctionCanonicalRand>(); }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override
+    {
+        auto col_to = ColumnVector<ToType>::create();
+        typename ColumnVector<ToType>::Container & vec_to = col_to->getData();
+
+        size_t size = input_rows_count;
+        vec_to.resize(size);
+        CanonicalRandImpl::execute(reinterpret_cast<char *>(vec_to.data()), vec_to.size() * sizeof(ToType));
+
+        return col_to;
+    }
 };

 }
--- a/src/Functions/generateUUIDv4.cpp
+++ b/src/Functions/generateUUIDv4.cpp
@ -2,15 +2,49 @@
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
 #include <Functions/FunctionsRandom.h>
+#include <Common/TargetSpecific.h>

 namespace DB
 {

-#define DECLARE_SEVERAL_IMPLEMENTATIONS(...) \
-DECLARE_DEFAULT_CODE      (__VA_ARGS__) \
-DECLARE_AVX2_SPECIFIC_CODE(__VA_ARGS__)
+namespace
+{

-DECLARE_SEVERAL_IMPLEMENTATIONS(
+void generateUUID4Generic(ColumnVector<UUID>::Container & vec_to)
+{
+    RandImpl::execute(reinterpret_cast<char *>(vec_to.data()), vec_to.size() * sizeof(UUID));
+    for (UUID & uuid : vec_to)
+    {
+        /// https://tools.ietf.org/html/rfc4122#section-4.4
+        UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0xffffffffffff0fffull) | 0x0000000000004000ull;
+        UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull;
+    }
+}
+
+#if USE_MULTITARGET_CODE
+
+AVX2_FUNCTION_SPECIFIC_ATTRIBUTE void NO_INLINE generateUUID4AVX2(ColumnVector<UUID>::Container & vec_to)
+{
+    RandImpl::executeAVX2(reinterpret_cast<char *>(vec_to.data()), vec_to.size() * sizeof(UUID));
+    for (UUID & uuid : vec_to)
+    {
+        UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0xffffffffffff0fffull) | 0x0000000000004000ull;
+        UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull;
+    }
+}
+
+AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE void NO_INLINE generateUUID4AVX512BW(ColumnVector<UUID>::Container & vec_to)
+{
+    RandImpl::executeAVX512BW(reinterpret_cast<char *>(vec_to.data()), vec_to.size() * sizeof(UUID));
+    for (UUID & uuid : vec_to)
+    {
+        UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0xffffffffffff0fffull) | 0x0000000000004000ull;
+        UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull;
+    }
+}
+
+#endif
+}

 class FunctionGenerateUUIDv4 : public IFunction
 {
@ -44,51 +78,27 @@ public:

        size_t size = input_rows_count;
        vec_to.resize(size);
-
-        /// RandImpl is target-dependent and is not the same in different TargetSpecific namespaces.
-        RandImpl::execute(reinterpret_cast<char *>(vec_to.data()), vec_to.size() * sizeof(UUID));
-
-        for (UUID & uuid : vec_to)
-        {
-            /// https://tools.ietf.org/html/rfc4122#section-4.4
-
-            UUIDHelpers::getHighBytes(uuid) = (UUIDHelpers::getHighBytes(uuid) & 0xffffffffffff0fffull) | 0x0000000000004000ull;
-            UUIDHelpers::getLowBytes(uuid) = (UUIDHelpers::getLowBytes(uuid) & 0x3fffffffffffffffull) | 0x8000000000000000ull;
-        }
-
-        return col_res;
-    }
-};
-
-) // DECLARE_SEVERAL_IMPLEMENTATIONS
-#undef DECLARE_SEVERAL_IMPLEMENTATIONS
-
-class FunctionGenerateUUIDv4 : public TargetSpecific::Default::FunctionGenerateUUIDv4
-{
-public:
-    explicit FunctionGenerateUUIDv4(ContextPtr context) : selector(context)
-    {
-        selector.registerImplementation<TargetArch::Default,
-            TargetSpecific::Default::FunctionGenerateUUIDv4>();
+        if (!size)
+            return col_res;

 #if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2,
-            TargetSpecific::AVX2::FunctionGenerateUUIDv4>();
+        if (isArchSupported(TargetArch::AVX512BW))
+        {
+            generateUUID4AVX512BW(vec_to);
+            return col_res;
+        }
+
+        if (isArchSupported(TargetArch::AVX2))
+        {
+            generateUUID4AVX2(vec_to);
+            return col_res;
+        }
 #endif
+        generateUUID4Generic(vec_to);
+        return col_res;
    }

-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionGenerateUUIDv4>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionGenerateUUIDv4>(); }
 };

 REGISTER_FUNCTION(GenerateUUIDv4)
--- a/src/Functions/randConstant.cpp
+++ b/src/Functions/randConstant.cpp
@ -107,7 +107,7 @@ public:

        typename ColumnVector<ToType>::Container vec_to(1);

-        TargetSpecific::Default::RandImpl::execute(reinterpret_cast<char *>(vec_to.data()), sizeof(ToType));
+        RandImpl::execute(reinterpret_cast<char *>(vec_to.data()), sizeof(ToType));
        ToType value = vec_to[0];

        return std::make_unique<FunctionBaseRandomConstant<ToType, Name>>(value, argument_types, return_type);
--- a/src/Functions/randomFixedString.cpp
+++ b/src/Functions/randomFixedString.cpp
@ -25,8 +25,7 @@ namespace
 {

 /* Generate random fixed string with fully random bytes (including zero). */
-template <typename RandImpl>
-class FunctionRandomFixedStringImpl : public IFunction
+class FunctionRandomFixedString : public IFunction
 {
 public:
    static constexpr auto name = "randomFixedString";
@ -74,34 +73,8 @@ public:

        return col_to;
    }
-};

-class FunctionRandomFixedString : public FunctionRandomFixedStringImpl<TargetSpecific::Default::RandImpl>
-{
-public:
-    explicit FunctionRandomFixedString(ContextPtr context) : selector(context)
-    {
-        selector.registerImplementation<TargetArch::Default,
-            FunctionRandomFixedStringImpl<TargetSpecific::Default::RandImpl>>();
-
-    #if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2,
-            FunctionRandomFixedStringImpl<TargetSpecific::AVX2::RandImpl>>();
-    #endif
-    }
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionRandomFixedString>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionRandomFixedString>(); }
 };

 }
--- a/src/Functions/randomString.cpp
+++ b/src/Functions/randomString.cpp
@ -23,8 +23,7 @@ namespace
 {

 /* Generate random string of specified length with fully random bytes (including zero). */
-template <typename RandImpl>
-class FunctionRandomStringImpl : public IFunction
+class FunctionRandomString : public IFunction
 {
 public:
    static constexpr auto name = "randomString";
@ -92,34 +91,8 @@ public:

        return col_to;
    }
-};

-class FunctionRandomString : public FunctionRandomStringImpl<TargetSpecific::Default::RandImpl>
-{
-public:
-    explicit FunctionRandomString(ContextPtr context) : selector(context)
-    {
-        selector.registerImplementation<TargetArch::Default,
-            FunctionRandomStringImpl<TargetSpecific::Default::RandImpl>>();
-
-    #if USE_MULTITARGET_CODE
-        selector.registerImplementation<TargetArch::AVX2,
-            FunctionRandomStringImpl<TargetSpecific::AVX2::RandImpl>>();
-    #endif
-    }
-
-    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
-    {
-        return selector.selectAndExecute(arguments, result_type, input_rows_count);
-    }
-
-    static FunctionPtr create(ContextPtr context)
-    {
-        return std::make_shared<FunctionRandomString>(context);
-    }
-
-private:
-    ImplementationSelector<IFunction> selector;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionRandomString>(); }
 };

 }
--- a/tests/performance/scripts/config/config.d/zzz-perf-comparison-tweaks-config.xml
+++ b/tests/performance/scripts/config/config.d/zzz-perf-comparison-tweaks-config.xml
@ -19,6 +19,4 @@
    <uncompressed_cache_size>1000000000</uncompressed_cache_size>

    <asynchronous_metrics_update_period_s>10</asynchronous_metrics_update_period_s>
-
-    <remap_executable replace="replace">true</remap_executable>
 </clickhouse>