diff --git a/src/Compression/LZ4_decompress_faster.cpp b/src/Compression/LZ4_decompress_faster.cpp index 84471836cfc..33726811848 100644 --- a/src/Compression/LZ4_decompress_faster.cpp +++ b/src/Compression/LZ4_decompress_faster.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include @@ -16,7 +16,7 @@ #include #endif -#if defined(__AVX512VBMI__) +#if USE_MULTITARGET_CODE #include #endif @@ -408,9 +408,9 @@ inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset) match += shift4[offset]; } +DECLARE_AVX512VBMI_SPECIFIC_CODE( inline void copyOverlap32Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) { -#if defined(__AVX512VBMI__) && !defined(MEMORY_SANITIZER) static constexpr UInt8 __attribute__((__aligned__(32))) masks[] = { 0, 1, 2, 2, 4, 2, 2, 4, 8, 5, 2, 10, 8, 6, 4, 2, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, /* offset=0, shift amount index. */ @@ -447,28 +447,25 @@ inline void copyOverlap32Shuffle(UInt8 * op, const UInt8 *& match, const size_t 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, }; - if (DB::Cpu::CpuFlagsCache::have_AVX512VBMI) - { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(op), - _mm256_permutexvar_epi8( - _mm256_load_si256(reinterpret_cast(masks) + offset), - _mm256_loadu_si256(reinterpret_cast(match)))); - match += masks[offset]; - } - else - { - copyOverlap32(op, match, offset); - } -#else - copyOverlap32(op, match, offset); -#endif + _mm256_storeu_si256(reinterpret_cast<__m256i *>(op), + _mm256_permutexvar_epi8( + _mm256_load_si256(reinterpret_cast(masks) + offset), + _mm256_loadu_si256(reinterpret_cast(match)))); + match += masks[offset]; } +) /// DECLARE_AVX512VBMI_SPECIFIC_CODE template <> void inline copy<32>(UInt8 * dst, const UInt8 * src) { copy32(dst, src); } template <> void inline wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy32(dst, src, dst_end); } template <> void inline copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32(op, match, offset); } -template <> void inline copyOverlap<32, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32Shuffle(op, match, offset); } +template <> void inline copyOverlap<32, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { +#if USE_MULTITARGET_CODE + TargetSpecific::AVX512VBMI::copyOverlap32Shuffle(op, match, offset); +#else + copyOverlap32(op, match, offset); +#endif +} /// See also https://stackoverflow.com/a/30669632 @@ -641,9 +638,9 @@ bool decompress( if (dest_size >= 32768) { size_t variant_size = 4; -#if defined(__AVX512VBMI__) && !defined(MEMORY_SANITIZER) +#if USE_MULTITARGET_CODE && !defined(MEMORY_SANITIZER) /// best_variant == 4 only valid when AVX512VBMI available - if (DB::Cpu::CpuFlagsCache::have_AVX512VBMI) + if (isArchSupported(DB::TargetArch::AVX512VBMI)) variant_size = 5; #endif size_t best_variant = statistics.select(variant_size);