From 3548e974e1a8d7cbeaf8789b461cfc6f648f3135 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 13 Aug 2021 17:26:47 +0200 Subject: [PATCH 01/82] Implement HasAll specialization for SSE and AVX2 --- src/Functions/GatherUtils/Algorithms.h | 893 +++++++++++++++++++++-- src/Functions/GatherUtils/CMakeLists.txt | 4 + src/Functions/tests/gtest_hasAll.cpp | 104 +++ 3 files changed, 921 insertions(+), 80 deletions(-) create mode 100644 src/Functions/tests/gtest_hasAll.cpp diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index fc54eaf88ab..245794da976 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,7 +7,9 @@ #include #include #include "GatherUtils.h" - +#if defined(__AVX2__) + #include +#endif namespace DB::ErrorCodes { @@ -418,41 +420,55 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. -template < - ArraySearchType search_type, - typename FirstSliceType, - typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) + +template +bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + const NumericArraySlice & second [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) { - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (size_t i = 0; i < second.size; ++i) - { - bool has = false; - for (size_t j = 0; j < first.size && !has; ++j) - { - const bool is_first_null = has_first_null_map && first_null_map[j]; - const bool is_second_null = has_second_null_map && second_null_map[i]; - - if (is_first_null && is_second_null) - has = true; - - if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) - has = true; - } - - if (has && search_type == ArraySearchType::Any) - return true; - - if (!has && search_type == ArraySearchType::All) - return false; - } - return search_type == ArraySearchType::All; + /// TODO: Decimal scale + if constexpr (is_decimal && is_decimal) + return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); + else if constexpr (is_decimal || is_decimal) + return false; + else + return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); } +template +bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) +{ + return false; +} + +template +bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) +{ + return false; +} + +inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; +} + +template +bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) +{ + if constexpr (is_decimal) + return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value); + else + return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); +} +inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; +} + + /// For details of Knuth-Morris-Pratt string matching algorithm see /// https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm. @@ -481,6 +497,770 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. +template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (size_t i = 0; i < second.size; ++i) + { + bool has = false; + for (unsigned j = 0; j < first.size && !has; ++j) + { + const bool is_first_null = has_first_null_map && first_null_map[j]; + const bool is_second_null = has_second_null_map && second_null_map[i]; + + if (is_first_null && is_second_null) + has = true; + + if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) + has = true; + } + + if (has && search_type == ArraySearchType::Any) + return true; + + if (!has && search_type == ArraySearchType::All) + return false; + } + return search_type == ArraySearchType::All; +} + + +#if defined(__AVX2__) +// AVX2 - Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 7 && second.size > 7) + { + for (; j < first.size-7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + // bitmask is fulfilled with ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_first_null_map ? _mm256_set_epi32((first_null_map[j+7])? full: none, + (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, + (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + :zeros; + + size_t i = 0; + // Browse second array to try to match ell first elements + for (; i < second.size-7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))) + ,bitmask); + } + + if (i < second.size) + { + // Loop(i)-jam + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi32(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + // Loop(j)-jam + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. +// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > (static_cast &>(first), static_cast &>(second), first_null_map, second_null_map); +// } + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 3 && second.size > 3) + { + for (; j < first.size-3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + __m256i bitmask = has_first_null_map ? _mm256_set_epi64x((first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + :zeros; + + unsigned i = 0; + for (; i < second.size-3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi64x(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_second_null_map = second_null_map != nullptr; + const bool has_first_null_map = first_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 15 && second.size > 15) + { + for (; j < first.size-15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + __m256i bitmask = has_first_null_map ? _mm256_set_epi16((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, + (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, + (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, + (first_null_map[j+9])? full: none, (first_null_map[j+8])? full: none, + (first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none + ) + :zeros; + unsigned i = 0; + for (; i < second.size-15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data,s_data,1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi16(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +#else + +// SSE4.2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 3 && second.size > 2) + { + const int full = -1, none = 0; + for (; j < first.size-3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi32((first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + :zeros; + + unsigned i = 0; + for (; i < second.size-3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i r_i = _mm_set1_epi32(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// SSE4.2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + for (; j < first.size-1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi64x((first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + : zeros; + unsigned i = 0; + for (; i < second.size-1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))) + ,bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi64x(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + // skip null elements since both have at least one + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// SSE4.2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 6 && second.size > 6) + { + for (; j < first.size-7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi16((first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none + ) + :zeros; + unsigned i = 0; + for (; i < second.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi16(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} +#endif + +// SSE4.2 Int8_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 15) +{ + for (; j < first.size-15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi8((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, + (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, + (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, + (first_null_map[j+9]) ? full: none, (first_null_map[j+8]) ? full: none, + (first_null_map[j+7]) ? full: none, (first_null_map[j+6]) ? full: none, + (first_null_map[j+5]) ? full: none, (first_null_map[j+4]) ? full: none, + (first_null_map[j+3]) ? full: none, (first_null_map[j+2]) ? full: none, + (first_null_map[j+1]) ? full: none, (first_null_map[j]) ? full: none + ) + : zeros; + unsigned i = 0; + for (; i < second.size-15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_lddqu_si128(reinterpret_cast(second_null_map+i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi8(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + + return has_mask || found; +} + + template < typename FirstSliceType, typename SecondSliceType, bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t), @@ -551,53 +1331,6 @@ bool sliceHasImpl(const FirstSliceType & first, const SecondSliceType & second, } -template -bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - const NumericArraySlice & second [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - /// TODO: Decimal scale - if constexpr (is_decimal && is_decimal) - return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); - else if constexpr (is_decimal || is_decimal) - return false; - else - return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); -} - -template -bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) -{ - return false; -} - -template -bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) -{ - return false; -} - -inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; -} - -template -bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - if constexpr (is_decimal) - return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value); - else - return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); -} -inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; -} - template bool sliceHas(const NumericArraySlice & first, const NumericArraySlice & second) { diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index f30527c2a46..731407e774c 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,6 +11,10 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() +if (HAVE_AVX2) + target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) +endif() + if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp new file mode 100644 index 00000000000..bbc841e7605 --- /dev/null +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -0,0 +1,104 @@ +#include + +#include + +using namespace DB::GatherUtils; + + + +template +void array_init(T* first, size_t first_size, T* second, size_t second_size, bool expected_return) { + T i = 0; + for (; i < second_size; i++) { + second[i] = i; + } + for (i=0; i < first_size; i++) { + first[i] = second[std::rand()%second_size]; + } + // Set one element different from + if (!expected_return) { + first[first_size-1] = second_size+1; + } +} + +void null_map_init(UInt8 * null_map, size_t null_map_size, size_t nb_elem) { + for (int i =0; i < null_map_size-1 && i < nb_elem; i++) { + null_map[std::rand()%null_map_size-1] = 1; + } +} + +template +bool test_hasAll(size_t first_size, size_t second_size, bool have_null_map, bool expected_output) { + T* first_data = new T [first_size]; + T* second_data = new T [second_size]; + + UInt8 *first_nm = nullptr, *second_nm = nullptr; + if (have_null_map) { + first_nm = new UInt8 [first_size]; + second_nm = new UInt8 [second_size]; + null_map_init(first_nm, first_size, 5); + null_map_init(second_nm, second_size, 2); + } + + array_init(first_data, first_size, second_data, second_size, expected_output); + + NumericArraySlice first = {first_data, first_size}; + NumericArraySlice second = {second_data, second_size}; + + // Test + /// Check if all first array are elements from second array, overloaded for various combinations of types. + return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(first, second, nullptr, nullptr); +} + +TEST(HasAll, integer) +{ + bool test1 = test_hasAll(4, 100, false, true); + bool test2 = test_hasAll(4, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + + +TEST(HasAll, int64) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + +TEST(HasAll, int16) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + +TEST(HasAll, int8) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(50, 125, false, true); + bool test4 = test_hasAll(50, 125, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} From abecb8114f470aa2b00122ee6079a32d6b17d62a Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Wed, 18 Aug 2021 15:19:31 +0200 Subject: [PATCH 02/82] Refactoring the hasAll gtest so that it works with the original hasAll --- src/Functions/tests/gtest_hasAll.cpp | 97 +++++++++++++++------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index bbc841e7605..310c059bbbc 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -5,57 +5,66 @@ using namespace DB::GatherUtils; - template -void array_init(T* first, size_t first_size, T* second, size_t second_size, bool expected_return) { - T i = 0; - for (; i < second_size; i++) { - second[i] = i; +void array_init(T* elements_to_have, size_t elements_to_have_count, T* set_elements, size_t set_size, bool expected_output) { + for (T i = 0; i < set_size; ++i) + { + set_elements[i] = i; } - for (i=0; i < first_size; i++) { - first[i] = second[std::rand()%second_size]; + for (T i = 0; i < elements_to_have_count; ++i) + { + elements_to_have[i] = set_elements[std::rand() % set_size]; } - // Set one element different from - if (!expected_return) { - first[first_size-1] = second_size+1; + if (!expected_output) + { + // make one element to be searched for missing from the target set + elements_to_have[elements_to_have_count - 1] = set_size + 1; } } -void null_map_init(UInt8 * null_map, size_t null_map_size, size_t nb_elem) { - for (int i =0; i < null_map_size-1 && i < nb_elem; i++) { - null_map[std::rand()%null_map_size-1] = 1; +void null_map_init(UInt8 * null_map, size_t null_map_size, size_t null_elements_count) +{ + for (int i = 0; i < null_map_size; ++i) + { + null_map[i] = 0; + } + for (int i = 0; i < null_map_size - 1 && i < null_elements_count; ++i) + { + null_map[std::rand() % null_map_size - 1] = 1; } } template -bool test_hasAll(size_t first_size, size_t second_size, bool have_null_map, bool expected_output) { - T* first_data = new T [first_size]; - T* second_data = new T [second_size]; +bool testHasAll(size_t elements_to_have_count, size_t set_size, bool have_null_map, bool expected_output) +{ + T * set_elements = new T[set_size]; + T * elements_to_have = new T[elements_to_have_count]; - UInt8 *first_nm = nullptr, *second_nm = nullptr; - if (have_null_map) { - first_nm = new UInt8 [first_size]; - second_nm = new UInt8 [second_size]; - null_map_init(first_nm, first_size, 5); - null_map_init(second_nm, second_size, 2); + UInt8 * first_nm = nullptr, * second_nm = nullptr; + if (have_null_map) + { + first_nm = new UInt8[set_size]; + second_nm = new UInt8[elements_to_have_count]; + null_map_init(first_nm, set_size, 5); + null_map_init(second_nm, elements_to_have_count, 2); } - array_init(first_data, first_size, second_data, second_size, expected_output); + array_init(elements_to_have, elements_to_have_count, set_elements, set_size, expected_output); - NumericArraySlice first = {first_data, first_size}; - NumericArraySlice second = {second_data, second_size}; + NumericArraySlice first = {set_elements, set_size}; + NumericArraySlice second = {elements_to_have, elements_to_have_count}; - // Test - /// Check if all first array are elements from second array, overloaded for various combinations of types. - return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(first, second, nullptr, nullptr); + /// Check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. + return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + first, second, first_nm, second_nm); } TEST(HasAll, integer) { - bool test1 = test_hasAll(4, 100, false, true); - bool test2 = test_hasAll(4, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(4, 100, false, true); + bool test2 = testHasAll(4, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -66,10 +75,10 @@ TEST(HasAll, integer) TEST(HasAll, int64) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -79,10 +88,10 @@ TEST(HasAll, int64) TEST(HasAll, int16) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -92,10 +101,10 @@ TEST(HasAll, int16) TEST(HasAll, int8) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(50, 125, false, true); - bool test4 = test_hasAll(50, 125, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(50, 125, false, true); + bool test4 = testHasAll(50, 125, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); From 92ec28a87b42653d7d1abef2369e4e6e3407a2cb Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Wed, 18 Aug 2021 16:52:14 +0200 Subject: [PATCH 03/82] Refactoring the new SIMD hasAll implementation to comply with current hasAll implementation (swapping 'first' and 'second' arguments meaning) and with ClickHouse C++ guidelines --- src/Functions/GatherUtils/Algorithms.h | 622 +++++++++++++------------ 1 file changed, 336 insertions(+), 286 deletions(-) diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index 245794da976..2812821e339 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,7 +7,7 @@ #include #include #include "GatherUtils.h" -#if defined(__AVX2__) +#if defined(__AVX2__) || defined(__SSE4_2__) #include #endif @@ -420,7 +420,6 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } - template bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], const NumericArraySlice & second [[maybe_unused]], @@ -469,7 +468,6 @@ inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, } - /// For details of Knuth-Morris-Pratt string matching algorithm see /// https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm. /// A "prefix-function" is defined as: i-th element is the length of the longest of all prefixes that end in i-th position @@ -536,91 +534,97 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se #if defined(__AVX2__) // AVX2 - Int specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; + const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi32(full); const __m256i zeros = _mm256_setzero_si256(); - if (first.size > 7 && second.size > 7) + if (second.size > 7 && first.size > 7) { - for (; j < first.size-7 && has_mask; j += 8) + for (; j < second.size - 7 && has_mask; j += 8) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - // bitmask is fulfilled with ones for ones which are considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_first_null_map ? _mm256_set_epi32((first_null_map[j+7])? full: none, - (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, - (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; size_t i = 0; - // Browse second array to try to match ell first elements - for (; i < second.size-7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + // Search first array to try to match all second elements + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); // Create a mask to avoid to compare null elements // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - second_nm_mask, + first_nm_mask, _mm256_cmpeq_epi32(f_data, s_data)), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) ), _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))) - ,bitmask); + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); } - if (i < second.size) + if (i < first.size) { // Loop(i)-jam - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (second_null_map[i]) continue; - __m256i v_i = _mm256_set1_epi32(second.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); has_mask = _mm256_testc_si256 (bitmask, ones); } } @@ -629,13 +633,15 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * second_null_map, const UInt8 * first_null_map) // { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > (static_cast &>(first), static_cast &>(second), first_null_map, second_null_map); +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); // } // AVX2 Int64 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi64x(full); const __m256i zeros = _mm256_setzero_si256(); - if (first.size > 3 && second.size > 3) + if (second.size > 3 && first.size > 3) { - for (; j < first.size-3 && has_mask; j += 4) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - __m256i bitmask = has_first_null_map ? _mm256_set_epi64x((first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - second_nm_mask, + first_nm_mask, _mm256_cmpeq_epi64(f_data, s_data)), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2))))) - ), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (second_null_map[i]) continue; - __m256i v_i = _mm256_set1_epi64x(second.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); has_mask = _mm256_testc_si256 (bitmask, ones); } } @@ -723,13 +736,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_second_null_map = second_null_map != nullptr; const bool has_first_null_map = first_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi16(full); const __m256i zeros = _mm256_setzero_si256(); - if (first.size > 15 && second.size > 15) + if (second.size > 15 && first.size > 15) { - for (; j < first.size-15 && has_mask; j += 16) + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - __m256i bitmask = has_first_null_map ? _mm256_set_epi16((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, - (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, - (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, - (first_null_map[j+9])? full: none, (first_null_map[j+8])? full: none, - (first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); bitmask = _mm256_or_si256( _mm256_or_si256( @@ -781,79 +800,80 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const __m128i zeros = _mm_setzero_si128(); - if (first.size > 3 && second.size > 2) + if (second.size > 3 && first.size > 2) { const int full = -1, none = 0; - for (; j < first.size-3 && has_mask; j += 4) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi32((first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi32(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) ), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i r_i = _mm_set1_epi32(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); has_mask = _mm_test_all_ones(bitmask); } } @@ -946,13 +973,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const Int64 full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - for (; j < first.size-1 && has_mask; j += 2) + for (; j < second.size - 1 && has_mask; j += 2) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi64x((first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - : zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi32(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))) - ,bitmask); + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i v_i = _mm_set1_epi64x(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); has_mask = _mm_test_all_ones(bitmask); } } } bool found = false; - for (; j < first.size && has_mask; j++) + for (; j < second.size && has_mask; j++) { - // skip null elements since both have at least one - found = (has_first_null_map && first_null_map[j])? true: false; - for (unsigned i = 0; i < second.size && !found; i ++) + found = (has_second_null_map && second_null_map[j]) ? true : false; + for (unsigned i = 0; i < first.size && !found; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - found = (second.data[i] == first.data[j]); + if (has_first_null_map && first_null_map[i]) + continue; + found = (first.data[i] == second.data[j]); } if (!found) return false; @@ -1030,80 +1064,87 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + if (second.size == 0) + return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int16_t full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - if (first.size > 6 && second.size > 6) + if (second.size > 6 && first.size > 6) { - for (; j < first.size-7 && has_mask; j += 8) + for (; j < second.size - 7 && has_mask; j += 8) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi16((first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none - ) - :zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; unsigned i = 0; - for (; i < second.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi16(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) ), _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) ), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i v_i = _mm_set1_epi16(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); has_mask = _mm_test_all_ones(bitmask); } } @@ -1111,57 +1152,62 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - if (first.size > 15) -{ - for (; j < first.size-15 && has_mask; j += 16) + if (second.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi8((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, - (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, - (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, - (first_null_map[j+9]) ? full: none, (first_null_map[j+8]) ? full: none, - (first_null_map[j+7]) ? full: none, (first_null_map[j+6]) ? full: none, - (first_null_map[j+5]) ? full: none, (first_null_map[j+4]) ? full: none, - (first_null_map[j+3]) ? full: none, (first_null_map[j+2]) ? full: none, - (first_null_map[j+1]) ? full: none, (first_null_map[j]) ? full: none - ) - : zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_lddqu_si128(reinterpret_cast(second_null_map+i)) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( @@ -1169,89 +1215,91 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll Date: Thu, 19 Aug 2021 11:25:14 +0200 Subject: [PATCH 04/82] Adding a null map negative test, the new hasAll implementation needs correction for that case --- src/Functions/tests/gtest_hasAll.cpp | 64 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index 310c059bbbc..b7ba59f91c7 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -6,57 +6,58 @@ using namespace DB::GatherUtils; template -void array_init(T* elements_to_have, size_t elements_to_have_count, T* set_elements, size_t set_size, bool expected_output) { - for (T i = 0; i < set_size; ++i) +void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { + for (T i = 0; i < array_size; ++i) { - set_elements[i] = i; + array_elements[i] = i; } - for (T i = 0; i < elements_to_have_count; ++i) + for (T i = 0; i < nb_elements_to_have; ++i) { - elements_to_have[i] = set_elements[std::rand() % set_size]; + elements_to_have[i] = array_elements[std::rand() % array_size]; } - if (!expected_output) + if (!all_elements_present) { - // make one element to be searched for missing from the target set - elements_to_have[elements_to_have_count - 1] = set_size + 1; + /// make one element to be searched for missing from the target array + elements_to_have[nb_elements_to_have - 1] = array_size + 1; } } -void null_map_init(UInt8 * null_map, size_t null_map_size, size_t null_elements_count) +void nullMapInit(UInt8 * null_map, size_t null_map_size, size_t nb_null_elements) { for (int i = 0; i < null_map_size; ++i) { null_map[i] = 0; } - for (int i = 0; i < null_map_size - 1 && i < null_elements_count; ++i) + for (int i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) { - null_map[std::rand() % null_map_size - 1] = 1; + null_map[std::rand() % null_map_size] = 1; } } template -bool testHasAll(size_t elements_to_have_count, size_t set_size, bool have_null_map, bool expected_output) +bool testHasAll(size_t nb_elements_to_have, size_t array_size, bool with_null_maps, bool all_elements_present) { - T * set_elements = new T[set_size]; - T * elements_to_have = new T[elements_to_have_count]; + auto array_elements = std::make_unique(array_size); + auto elements_to_have = std::make_unique(nb_elements_to_have); - UInt8 * first_nm = nullptr, * second_nm = nullptr; - if (have_null_map) + std::unique_ptr first_nm = nullptr, second_nm = nullptr; + if (with_null_maps) { - first_nm = new UInt8[set_size]; - second_nm = new UInt8[elements_to_have_count]; - null_map_init(first_nm, set_size, 5); - null_map_init(second_nm, elements_to_have_count, 2); + first_nm = std::make_unique(array_size); + second_nm = std::make_unique(nb_elements_to_have); + /// add a null to elements to have, but not to the target array, making the answer negative + nullMapInit(first_nm.get(), array_size, 0); + nullMapInit(second_nm.get(), nb_elements_to_have, 1); } - array_init(elements_to_have, elements_to_have_count, set_elements, set_size, expected_output); + arrayInit(elements_to_have.get(), nb_elements_to_have, array_elements.get(), array_size, all_elements_present); - NumericArraySlice first = {set_elements, set_size}; - NumericArraySlice second = {elements_to_have, elements_to_have_count}; + NumericArraySlice first = {array_elements.get(), array_size}; + NumericArraySlice second = {elements_to_have.get(), nb_elements_to_have}; - /// Check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. + /// check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - first, second, first_nm, second_nm); + first, second, first_nm.get(), second_nm.get()); } TEST(HasAll, integer) @@ -111,3 +112,16 @@ TEST(HasAll, int8) ASSERT_EQ(test3, true); ASSERT_EQ(test4, false); } + +TEST(HasAllSingleNullElement, all) +{ + bool test1 = testHasAll(4, 100, true, true); + bool test2 = testHasAll(4, 100, true, true); + bool test3 = testHasAll(4, 100, true, true); + bool test4 = testHasAll(4, 100, true, true); + + ASSERT_EQ(test1, false); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, false); + ASSERT_EQ(test4, false); +} From 763bd006a75be454f5c109ef306a0e4e538726b1 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 19 Aug 2021 14:13:30 +0200 Subject: [PATCH 05/82] Correcting new hasAll implementation for the case with null elements present in 'second' and absent in 'first', refactoring the outer loop remainder into a separate function, improving null checking in the default implementation --- src/Functions/GatherUtils/Algorithms.h | 307 +++++++++++++------------ 1 file changed, 155 insertions(+), 152 deletions(-) diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index 2812821e339..d37341b0f81 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,10 +7,12 @@ #include #include #include "GatherUtils.h" + #if defined(__AVX2__) || defined(__SSE4_2__) - #include +#include #endif + namespace DB::ErrorCodes { extern const int LOGICAL_ERROR; @@ -495,6 +497,20 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } +inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) +{ + if (null_map != nullptr) + { + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + } + return false; +} + + /// Methods to check if first array has elements from second array, overloaded for various combinations of types. template < ArraySearchType search_type, @@ -506,19 +522,35 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; + const bool has_second_null = hasNull(second_null_map, second.size); + if (has_second_null) + { + const bool has_first_null = hasNull(first_null_map, first.size); + + if (has_first_null && search_type == ArraySearchType::Any) + return true; + + if (!has_first_null && search_type == ArraySearchType::All) + return false; + } + for (size_t i = 0; i < second.size; ++i) { + if (has_second_null_map && second_null_map[i]) + continue; + bool has = false; - for (unsigned j = 0; j < first.size && !has; ++j) + + for (size_t j = 0; j < first.size && !has; ++j) { - const bool is_first_null = has_first_null_map && first_null_map[j]; - const bool is_second_null = has_second_null_map && second_null_map[i]; + if (has_first_null_map && first_null_map[j]) + continue; - if (is_first_null && is_second_null) - has = true; - - if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) + if (isEqual(first, second, j, i)) + { has = true; + break; + } } if (has && search_type == ArraySearchType::Any) @@ -531,21 +563,60 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se } +#if defined(__AVX2__) || defined(__SSE4_2__) + +template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; +} + +#endif + + #if defined(__AVX2__) -// AVX2 - Int specialization +// AVX2 Int specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi32(full); @@ -625,28 +696,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll // inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * second_null_map, const UInt8 * first_null_map) +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) // { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( // static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); // } // AVX2 Int64 specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi64x(full); @@ -694,7 +754,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m256i first_nm_mask = _mm256_set_m128i( @@ -729,42 +789,33 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } // AVX2 Int16_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi16(full); @@ -787,7 +838,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m256i first_nm_mask = _mm256_set_m128i( @@ -874,26 +925,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } #elif defined(__SSE4_2__) @@ -901,17 +942,18 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const __m128i zeros = _mm_setzero_si128(); if (second.size > 3 && first.size > 2) @@ -972,36 +1014,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const Int64 full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1046,36 +1079,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int16_t full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1151,36 +1175,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } // SSE4.2 Int8_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1291,21 +1306,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll Date: Thu, 19 Aug 2021 15:29:26 +0200 Subject: [PATCH 06/82] Correcting { placement --- src/Functions/tests/gtest_hasAll.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index b7ba59f91c7..89f011cd7f1 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -6,7 +6,8 @@ using namespace DB::GatherUtils; template -void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { +void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) +{ for (T i = 0; i < array_size; ++i) { array_elements[i] = i; From 169c49c58378a6be1729b1ac2eadaf991d01b1ac Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Fri, 20 Aug 2021 13:00:40 +0200 Subject: [PATCH 07/82] Correcting style and resolving warnings --- src/Functions/tests/gtest_hasAll.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index 89f011cd7f1..ca7bc80b4aa 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -1,20 +1,29 @@ +#include #include - #include using namespace DB::GatherUtils; +auto uni_int_dist(int min, int max) +{ + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution<> dist(min, max); + return std::make_pair(dist, mt); +} + template void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { - for (T i = 0; i < array_size; ++i) + for (size_t i = 0; i < array_size; ++i) { array_elements[i] = i; } - for (T i = 0; i < nb_elements_to_have; ++i) + auto [dist, gen] = uni_int_dist(0, array_size - 1); + for (size_t i = 0; i < nb_elements_to_have; ++i) { - elements_to_have[i] = array_elements[std::rand() % array_size]; + elements_to_have[i] = array_elements[dist(gen)]; } if (!all_elements_present) { @@ -25,13 +34,15 @@ void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_element void nullMapInit(UInt8 * null_map, size_t null_map_size, size_t nb_null_elements) { - for (int i = 0; i < null_map_size; ++i) + /// -2 to keep the last element of the array non-null + auto [dist, gen] = uni_int_dist(0, null_map_size - 2); + for (size_t i = 0; i < null_map_size; ++i) { null_map[i] = 0; } - for (int i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) + for (size_t i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) { - null_map[std::rand() % null_map_size] = 1; + null_map[dist(gen)] = 1; } } From a3c08acac3d3ca3239b19dc09cf4bfb3730c37d3 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 26 Aug 2021 12:07:56 +0200 Subject: [PATCH 08/82] Moving sliceHasImplAnyAll and sliceEqualElements to separate header files to avoid SIMD instructions bloat in Algorithms.h --- src/Functions/GatherUtils/Algorithms.h | 856 +----------------- .../GatherUtils/sliceEqualElements.h | 41 + .../GatherUtils/sliceHasImplAnyAll.h | 839 +++++++++++++++++ 3 files changed, 882 insertions(+), 854 deletions(-) create mode 100644 src/Functions/GatherUtils/sliceEqualElements.h create mode 100644 src/Functions/GatherUtils/sliceHasImplAnyAll.h diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index d37341b0f81..4bab415f199 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,10 +7,8 @@ #include #include #include "GatherUtils.h" - -#if defined(__AVX2__) || defined(__SSE4_2__) -#include -#endif +#include "sliceEqualElements.h" +#include "sliceHasImplAnyAll.h" namespace DB::ErrorCodes @@ -422,38 +420,6 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } -template -bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - const NumericArraySlice & second [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - /// TODO: Decimal scale - if constexpr (is_decimal && is_decimal) - return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); - else if constexpr (is_decimal || is_decimal) - return false; - else - return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); -} - -template -bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) -{ - return false; -} - -template -bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) -{ - return false; -} - -inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; -} - template bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], size_t first_ind [[maybe_unused]], @@ -497,824 +463,6 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } -inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) -{ - if (null_map != nullptr) - { - for (size_t i = 0; i < null_map_size; ++i) - { - if (null_map[i]) - return true; - } - } - return false; -} - - -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. -template < - ArraySearchType search_type, - typename FirstSliceType, - typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - const bool has_second_null = hasNull(second_null_map, second.size); - if (has_second_null) - { - const bool has_first_null = hasNull(first_null_map, first.size); - - if (has_first_null && search_type == ArraySearchType::Any) - return true; - - if (!has_first_null && search_type == ArraySearchType::All) - return false; - } - - for (size_t i = 0; i < second.size; ++i) - { - if (has_second_null_map && second_null_map[i]) - continue; - - bool has = false; - - for (size_t j = 0; j < first.size && !has; ++j) - { - if (has_first_null_map && first_null_map[j]) - continue; - - if (isEqual(first, second, j, i)) - { - has = true; - break; - } - } - - if (has && search_type == ArraySearchType::Any) - return true; - - if (!has && search_type == ArraySearchType::All) - return false; - } - return search_type == ArraySearchType::All; -} - - -#if defined(__AVX2__) || defined(__SSE4_2__) - -template -inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( - size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (; j < second.size; ++j) - { - // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null - if (has_second_null_map && second_null_map[j]) - continue; - - bool found = false; - - for (size_t i = 0; i < first.size; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - - if (first.data[i] == second.data[j]) - { - found = true; - break; - } - } - - if (!found) - return false; - } - return true; -} - -#endif - - -#if defined(__AVX2__) -// AVX2 Int specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - size_t i = 0; - // Search first array to try to match all second elements - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// TODO: Discuss about -// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. -// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#elif defined(__SSE4_2__) - -// SSE4.2 Int specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 2) - { - const int full = -1, none = 0; - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int8_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#endif - - template < typename FirstSliceType, typename SecondSliceType, bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t), diff --git a/src/Functions/GatherUtils/sliceEqualElements.h b/src/Functions/GatherUtils/sliceEqualElements.h new file mode 100644 index 00000000000..f219d51c56a --- /dev/null +++ b/src/Functions/GatherUtils/sliceEqualElements.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include "Slices.h" + +namespace DB::GatherUtils +{ + +template +bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + const NumericArraySlice & second [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) +{ + /// TODO: Decimal scale + if constexpr (is_decimal && is_decimal) + return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); + else if constexpr (is_decimal || is_decimal) + return false; + else + return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); +} + +template +bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) +{ + return false; +} + +template +bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) +{ + return false; +} + +inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; +} + +} diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h new file mode 100644 index 00000000000..59d37473e42 --- /dev/null +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -0,0 +1,839 @@ +#pragma once + +#include "GatherUtils.h" +#include "Slices.h" +#include "sliceEqualElements.h" + +#if defined(__AVX2__) || defined(__SSE4_2__) +#include +#endif + +namespace DB::GatherUtils +{ + +namespace +{ + +inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) +{ + if (null_map != nullptr) + { + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + } + return false; +} + +} + +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. +template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + const bool has_second_null = hasNull(second_null_map, second.size); + if (has_second_null) + { + const bool has_first_null = hasNull(first_null_map, first.size); + + if (has_first_null && search_type == ArraySearchType::Any) + return true; + + if (!has_first_null && search_type == ArraySearchType::All) + return false; + } + + for (size_t i = 0; i < second.size; ++i) + { + if (has_second_null_map && second_null_map[i]) + continue; + + bool has = false; + + for (size_t j = 0; j < first.size && !has; ++j) + { + if (has_first_null_map && first_null_map[j]) + continue; + + if (isEqual(first, second, j, i)) + { + has = true; + break; + } + } + + if (has && search_type == ArraySearchType::Any) + return true; + + if (!has && search_type == ArraySearchType::All) + return false; + } + return search_type == ArraySearchType::All; +} + + +#if defined(__AVX2__) || defined(__SSE4_2__) + +namespace +{ + +template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; +} + +} + +#endif + +#if defined(__AVX2__) +// AVX2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + // Search first array to try to match all second elements + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + // Loop(i)-jam + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. +// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); +// } + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +#elif defined(__SSE4_2__) + +// SSE4.2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 3 && first.size > 2) + { + const int full = -1, none = 0; + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + unsigned i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int8_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +#endif + +} From 13878d261850d190dfec91c10d1d8846df7e8035 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Tue, 31 Aug 2021 14:04:15 +0200 Subject: [PATCH 09/82] Modify include files according to the processors capabilities --- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 59d37473e42..5603a802e7a 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -4,8 +4,12 @@ #include "Slices.h" #include "sliceEqualElements.h" -#if defined(__AVX2__) || defined(__SSE4_2__) -#include +#if defined(__SSE4_2__) + #include + #include +#endif +#if defined(__AVX2__) + #include #endif namespace DB::GatherUtils @@ -124,7 +128,7 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif #if defined(__AVX2__) -// AVX2 Int specialization +// AVX2 Int specialization of sliceHasImplAnyAll template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) From ade754d444f668eb0534e36f998bba341ba047e4 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Thu, 2 Sep 2021 18:28:25 +0200 Subject: [PATCH 10/82] Fix a bug for avx2 and add performance tests for HasAll --- tests/performance/hasAll.xml | 113 +++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/performance/hasAll.xml diff --git a/tests/performance/hasAll.xml b/tests/performance/hasAll.xml new file mode 100644 index 00000000000..a6ceb915bd5 --- /dev/null +++ b/tests/performance/hasAll.xml @@ -0,0 +1,113 @@ + + CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + + + INSERT INTO test_table_small SELECT + groupArraySample(500)(number) AS set, + groupArraySample(10)(number) AS subset + FROM (SELECT * FROM numbers(500)) + + INSERT INTO test_table_small2 SELECT + groupArraySample(500)(number) AS set, + groupArraySample(400)(number) AS subset + FROM (SELECT * FROM numbers(500)) + + INSERT INTO test_table_smallf SELECT + groupArraySample(500)(number) AS set, + groupArraySample(10)(number) AS subset + FROM (SELECT * FROM numbers(5000000)) + + + + INSERT INTO test_table_medium SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(50000) + ) + + INSERT INTO test_table_medium2 SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(4000)(number) AS subset + FROM + ( + SELECT * + FROM numbers(50000) + ) + + INSERT INTO test_table_mediumf SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + + + INSERT INTO test_table_large SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + INSERT INTO test_table_large2 SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(4000)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + INSERT INTO test_table_largef SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(100000000) + ) + + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + From a71944d11ddcbc2d277692d8175fea5d91220aed Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 3 Sep 2021 12:19:42 +0200 Subject: [PATCH 11/82] Add performance tests for HasAll for int{64,16,8} --- tests/performance/hasAll.xml | 113 ------------------------ tests/performance/hasAll_simd_int16.xml | 52 +++++++++++ tests/performance/hasAll_simd_int32.xml | 52 +++++++++++ tests/performance/hasAll_simd_int64.xml | 52 +++++++++++ tests/performance/hasAll_simd_int8.xml | 52 +++++++++++ 5 files changed, 208 insertions(+), 113 deletions(-) delete mode 100644 tests/performance/hasAll.xml create mode 100644 tests/performance/hasAll_simd_int16.xml create mode 100644 tests/performance/hasAll_simd_int32.xml create mode 100644 tests/performance/hasAll_simd_int64.xml create mode 100644 tests/performance/hasAll_simd_int8.xml diff --git a/tests/performance/hasAll.xml b/tests/performance/hasAll.xml deleted file mode 100644 index a6ceb915bd5..00000000000 --- a/tests/performance/hasAll.xml +++ /dev/null @@ -1,113 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - - - INSERT INTO test_table_small SELECT - groupArraySample(500)(number) AS set, - groupArraySample(10)(number) AS subset - FROM (SELECT * FROM numbers(500)) - - INSERT INTO test_table_small2 SELECT - groupArraySample(500)(number) AS set, - groupArraySample(400)(number) AS subset - FROM (SELECT * FROM numbers(500)) - - INSERT INTO test_table_smallf SELECT - groupArraySample(500)(number) AS set, - groupArraySample(10)(number) AS subset - FROM (SELECT * FROM numbers(5000000)) - - - - INSERT INTO test_table_medium SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(50000) - ) - - INSERT INTO test_table_medium2 SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(4000)(number) AS subset - FROM - ( - SELECT * - FROM numbers(50000) - ) - - INSERT INTO test_table_mediumf SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - - - INSERT INTO test_table_large SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - INSERT INTO test_table_large2 SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(4000)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - INSERT INTO test_table_largef SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(100000000) - ) - - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml new file mode 100644 index 00000000000..c2ce4eec77f --- /dev/null +++ b/tests/performance/hasAll_simd_int16.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(8000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml new file mode 100644 index 00000000000..4543dea161b --- /dev/null +++ b/tests/performance/hasAll_simd_int32.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml new file mode 100644 index 00000000000..07e52483bb1 --- /dev/null +++ b/tests/performance/hasAll_simd_int64.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(2000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml new file mode 100644 index 00000000000..5ddc84aa5bd --- /dev/null +++ b/tests/performance/hasAll_simd_int8.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + From 62487fe2fcf0eb90e53bd9291afec42761f4a797 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Mon, 6 Sep 2021 09:20:03 +0200 Subject: [PATCH 12/82] Pass SSE version to 4.2 and exploiting it's specific loadu --- src/Functions/GatherUtils/CMakeLists.txt | 3 + .../GatherUtils/sliceHasImplAnyAll.h | 140 ++++++++++-------- 2 files changed, 79 insertions(+), 64 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index 731407e774c..a379ccbadde 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,6 +11,9 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() +if (HAVE_SSE42) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -msse4.2") +endif() if (HAVE_AVX2) target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 5603a802e7a..111b9d767dd 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -6,6 +6,7 @@ #if defined(__SSE4_2__) #include + #include #include #endif #if defined(__AVX2__) @@ -153,7 +154,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); - // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi32( (second_null_map[j + 7]) ? full : none, @@ -167,15 +168,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -228,7 +230,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 7) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -262,7 +264,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 3) @@ -271,6 +273,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi64x( (second_null_map[j + 3])? full : none, @@ -283,9 +286,11 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -321,7 +326,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -343,7 +348,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15 && first.size > 15) @@ -367,9 +372,11 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -457,7 +464,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -481,10 +488,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 2) + if (second.size > 3 && first.size > 3) { - const int full = -1, none = 0; for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; @@ -540,7 +547,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -564,48 +571,51 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1 && first.size > 1) { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; j < second.size - 1 && has_mask; j += 2) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } } } } - if (!has_mask) + if (!has_mask && second.size > 1) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -634,7 +644,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data+j)); + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi16( (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, @@ -647,7 +657,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -701,13 +711,15 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 6) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// SSE4.2 Int8_t specialization +// Int8 version is faster with SSE than with AVX2 +#if defined(__SSE4_2__) +// SSE2 Int8_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) @@ -723,14 +735,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15) + if (second.size > 15 && first.size > 15) { for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi8( (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, @@ -745,9 +757,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data+i)); + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + _mm_lddqu_si128(reinterpret_cast(first_null_map + i)) : zeros; bitmask = _mm_or_si128( @@ -832,7 +844,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); From a810ce5dcb46a6de40fdc1afa9cb5ee7eab6a5b5 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Thu, 9 Sep 2021 11:21:32 +0200 Subject: [PATCH 13/82] Remove AVX2 to figure out where is the illegal intruction Enable AVX2 - int32 --- .../GatherUtils/sliceHasImplAnyAll.h | 235 ------------------ 1 file changed, 235 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 111b9d767dd..7c253cbc407 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -12,7 +12,6 @@ #if defined(__AVX2__) #include #endif - namespace DB::GatherUtils { @@ -236,240 +235,6 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. -// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - #elif defined(__SSE4_2__) // SSE4.2 Int specialization From 2a2eb3a27bf623e23c08d462d17ade6791bbab23 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 10 Sep 2021 15:07:36 +0200 Subject: [PATCH 14/82] re-enable full AVX2 - change lddqu to loadu - Update CmakeList.txt --- src/Functions/GatherUtils/CMakeLists.txt | 16 +- .../GatherUtils/sliceHasImplAnyAll.h | 269 ++++++++++++++++-- 2 files changed, 262 insertions(+), 23 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index a379ccbadde..b1c72656f24 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,13 +11,15 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() -if (HAVE_SSE42) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -msse4.2") -endif() -if (HAVE_AVX2) - target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) -endif() - if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() + +if (HAVE_SSE42) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) +endif() +if (HAVE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) +endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 7c253cbc407..a14acd08e93 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -128,6 +128,19 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif #if defined(__AVX2__) + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. +// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); +// } + // AVX2 Int specialization of sliceHasImplAnyAll template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( @@ -152,7 +165,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi32( @@ -169,13 +182,13 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); // Create a mask to avoid to compare null elements // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations const __m256i first_nm_mask = has_first_null_map? _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) : zeros; bitmask = _mm256_or_si256( @@ -235,6 +248,228 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + #elif defined(__SSE4_2__) // SSE4.2 Int specialization @@ -260,7 +495,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi32( (second_null_map[j + 3]) ? full : none, @@ -272,9 +507,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = @@ -341,7 +576,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi64x( (second_null_map[j + 1]) ? full : none, @@ -350,9 +585,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -409,7 +644,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi16( (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, @@ -420,9 +655,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -482,6 +717,8 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi8( (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, @@ -522,9 +759,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_lddqu_si128(reinterpret_cast(first_null_map + i)) + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) : zeros; bitmask = _mm_or_si128( From 72fb56904d8de814bddeb08f93d8c0882b6cd4d2 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Tue, 26 Oct 2021 10:43:23 +0200 Subject: [PATCH 15/82] Add cmake option to enable or not AVX2 instructions --- src/Functions/GatherUtils/CMakeLists.txt | 6 ++++-- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index b1c72656f24..f291663550d 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -1,4 +1,6 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +option(ENABLE_AVX2 "Enable AVX2 instructions (when available) when build for modern Intel CPUs" OFF) + add_headers_and_sources(clickhouse_functions_gatherutils .) add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers}) target_link_libraries(clickhouse_functions_gatherutils PRIVATE dbms) @@ -19,7 +21,7 @@ if (HAVE_SSE42) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) endif() -if (HAVE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") +if (HAVE_AVX2 AND ENABLE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index a14acd08e93..9028a94b2aa 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -85,7 +85,7 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se } -#if defined(__AVX2__) || defined(__SSE4_2__) +#if (defined(__AVX2__) && defined(ENABLE_AVX2)) || defined(__SSE4_2__) namespace { @@ -127,7 +127,7 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif -#if defined(__AVX2__) +#if defined(__AVX2__) && defined(ENABLE_AVX2) // TODO: Discuss about // raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" From 0154eab9cb1000c831bc44c49cfc1d3ccf2ff5c1 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Wed, 8 Dec 2021 10:27:42 +0100 Subject: [PATCH 16/82] Modify performance tests for HasAll, removing Large tests to see if it helps passing CICD --- tests/performance/hasAll_simd_int16.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int32.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int64.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int8.xml | 16 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml index c2ce4eec77f..63d869e7794 100644 --- a/tests/performance/hasAll_simd_int16.xml +++ b/tests/performance/hasAll_simd_int16.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml index 4543dea161b..074901737b0 100644 --- a/tests/performance/hasAll_simd_int32.xml +++ b/tests/performance/hasAll_simd_int32.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml index 07e52483bb1..9e68d3d219c 100644 --- a/tests/performance/hasAll_simd_int64.xml +++ b/tests/performance/hasAll_simd_int64.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml index 5ddc84aa5bd..4a0b30524ad 100644 --- a/tests/performance/hasAll_simd_int8.xml +++ b/tests/performance/hasAll_simd_int8.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + From c2b761acf282c00290dae1cbeb1ae43d8a7858bd Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Wed, 8 Dec 2021 11:01:24 +0100 Subject: [PATCH 17/82] Add cmake option to enable or not AVX2 instructions This reverts commit bca8eca44fe382b6efe80a381d42e6ede8a91fa3. --- src/Functions/GatherUtils/CMakeLists.txt | 2 +- tests/performance/hasAll_simd_int16.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int32.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int64.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int8.xml | 16 ++++++++-------- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index f291663550d..10909b99b82 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -22,6 +22,6 @@ if (HAVE_SSE42) target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) endif() if (HAVE_AVX2 AND ENABLE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) endif() diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml index 63d869e7794..c2ce4eec77f 100644 --- a/tests/performance/hasAll_simd_int16.xml +++ b/tests/performance/hasAll_simd_int16.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml index 074901737b0..4543dea161b 100644 --- a/tests/performance/hasAll_simd_int32.xml +++ b/tests/performance/hasAll_simd_int32.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml index 9e68d3d219c..07e52483bb1 100644 --- a/tests/performance/hasAll_simd_int64.xml +++ b/tests/performance/hasAll_simd_int64.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml index 4a0b30524ad..5ddc84aa5bd 100644 --- a/tests/performance/hasAll_simd_int8.xml +++ b/tests/performance/hasAll_simd_int8.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef From 9ec7e61f5f96a97653cbf364ef68d61ca2b09d07 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Mon, 31 Jan 2022 12:15:20 +0100 Subject: [PATCH 18/82] Add Unsigned version for Int8, Int16, Int32, Int64 for SS4.2 and AVX2 (aka AVX256) --- .../GatherUtils/sliceHasImplAnyAll.h | 844 ++++++++++++++++-- 1 file changed, 768 insertions(+), 76 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 9028a94b2aa..52448f88447 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -129,22 +129,178 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #if defined(__AVX2__) && defined(ENABLE_AVX2) -// TODO: Discuss about -// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. -// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int specialization of sliceHasImplAnyAll +// AVX2 Int64 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int32 specialization of sliceHasImplAnyAll +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -248,10 +404,117 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + // Loop(i)-jam + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 7) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -264,52 +527,106 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 3) + if (second.size > 15 && first.size > 15) { - for (; j < second.size - 3 && has_mask; j += 4) + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) : zeros; - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) { const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); const __m256i first_nm_mask = has_first_null_map? _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); } if (i < first.size) @@ -318,24 +635,24 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3) + if (!has_mask && second.size > 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// AVX2 Int16_t specialization +// AVX2 UInt16 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -472,10 +789,146 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 UInt64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int32 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -553,10 +1006,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -569,35 +1022,48 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1 && first.size > 1) + if (second.size > 3 && first.size > 3) { - for (; j < second.size - 1 && has_mask; j += 2) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) : zeros; + unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) { const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; + bitmask = _mm_or_si128( + _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( _mm_andnot_si128( _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), bitmask); } @@ -607,24 +1073,120 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1) + if (!has_mask && second.size > 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// SSE4.2 Int16_t specialization +// SSE4.2 Int16 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + unsigned i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 6) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 UInt16 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -721,10 +1283,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -852,6 +1414,136 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int8_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} #endif } From c2d19350807450250eadfa3b182b12f62bdc1e66 Mon Sep 17 00:00:00 2001 From: Anton Kozlov Date: Fri, 18 Mar 2022 15:56:25 +0000 Subject: [PATCH 19/82] Do not build krb5 if ENABLE_LIBRARIES is not set This module has hard dependency on SSL. If ENABLE_LIBRARIES is off then SSL is disabled. With this change, building this module will not break. --- contrib/krb5-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index 685e8737ef0..0d6075ee99e 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set (ENABLE_KRB5_DEFAULT 1) +set (ENABLE_KRB5_DEFAULT ${ENABLE_LIBRARIES}) if (NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND NOT CMAKE_CROSSCOMPILING)) message (WARNING "krb5 disabled in non-Linux and non-native-Darwin environments") set (ENABLE_KRB5_DEFAULT 0) From 097ff9cc98b985201f2af1dcd262b5300a9079ee Mon Sep 17 00:00:00 2001 From: Kerry Clendinning Date: Thu, 24 Mar 2022 08:35:22 -0500 Subject: [PATCH 20/82] Update index.md Fixed spelling "retuned" -> "returned" --- docs/en/sql-reference/functions/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index 7cceec889bd..572aa7f632e 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -77,7 +77,7 @@ A function configuration contains the following settings: - `argument` - argument description with the `type`, and optional `name` of an argument. Each argument is described in a separate setting. Specifying name is necessary if argument names are part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Default argument name value is `c` + argument_number. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. -- `return_name` - name of retuned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. +- `return_name` - name of returned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. - `type` - an executable type. If `type` is set to `executable` then single command is started. If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. - `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. From a216bc26c1906ec12cc49757f4c0e47ebdba2314 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Mar 2022 13:29:34 +0000 Subject: [PATCH 21/82] Correct check asof join key nullability --- src/Interpreters/TableJoin.cpp | 17 +++++++++-------- .../0_stateless/01428_nullable_asof_join.sql | 5 +++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 7b7ccb689c3..ec5358cf6bc 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -512,14 +512,6 @@ TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_co template void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool allow_right) { - if (strictness() == ASTTableJoin::Strictness::Asof) - { - if (clauses.size() != 1) - throw DB::Exception("ASOF join over multiple keys is not supported", ErrorCodes::NOT_IMPLEMENTED); - if (right.back().type->isNullable()) - throw DB::Exception("ASOF join over right table Nullable column is not implemented", ErrorCodes::NOT_IMPLEMENTED); - } - if (!left_type_map.empty() || !right_type_map.empty()) return; @@ -531,6 +523,15 @@ void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig for (const auto & col : right) right_types[renamedRightColumnName(col.name)] = col.type; + if (strictness() == ASTTableJoin::Strictness::Asof) + { + if (clauses.size() != 1) + throw DB::Exception("ASOF join over multiple keys is not supported", ErrorCodes::NOT_IMPLEMENTED); + + auto asof_key_type = right_types.find(clauses.back().key_names_right.back()); + if (asof_key_type != right_types.end() && asof_key_type->second->isNullable()) + throw DB::Exception("ASOF join over right table Nullable column is not implemented", ErrorCodes::NOT_IMPLEMENTED); + } forAllKeys(clauses, [&](const auto & left_key_name, const auto & right_key_name) { diff --git a/tests/queries/0_stateless/01428_nullable_asof_join.sql b/tests/queries/0_stateless/01428_nullable_asof_join.sql index 30e5c51eb1c..e1b00158d68 100644 --- a/tests/queries/0_stateless/01428_nullable_asof_join.sql +++ b/tests/queries/0_stateless/01428_nullable_asof_join.sql @@ -109,3 +109,8 @@ FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM n ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.dt >= b.dt AND a.pk = b.pk ORDER BY a.dt; -- { serverError 48 } + +SELECT * +FROM (SELECT NULL AS y, 1 AS x, '2020-01-01 10:10:10' :: DateTime64 AS t) AS t1 +ASOF LEFT JOIN (SELECT NULL AS y, 1 AS x, '2020-01-01 10:10:10' :: DateTime64 AS t) AS t2 +ON t1.t <= t2.t AND t1.x == t2.x FORMAT Null; From 0722beca0c615cd698a4efc71c4ebd1f642585bb Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 30 Mar 2022 08:45:30 -0300 Subject: [PATCH 22/82] Update Client.cpp --- programs/client/Client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index c2094b3b00d..3d5cc291f46 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -810,7 +810,7 @@ void Client::addOptions(OptionsDescription & options_description) ("quota_key", po::value(), "A string to differentiate quotas when the user have keyed quotas configured on server") ("max_client_network_bandwidth", po::value(), "the maximum speed of data exchange over the network for the client in bytes per second.") - ("compression", po::value(), "enable or disable compression") + ("compression", po::value(), "enable or disable compression (enabled by default for remote communication and disabled for localhost communication).") ("query-fuzzer-runs", po::value()->default_value(0), "After executing every SELECT query, do random mutations in it and run again specified number of times. This is used for testing to discover unexpected corner cases.") ("interleave-queries-file", po::value>()->multitoken(), From 91eec8962fb78e03be002966b61bcfbdf6228a9d Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:39:28 +0200 Subject: [PATCH 23/82] Rename test --- src/Functions/tests/{gtest_hasAll.cpp => gtest_has_all.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/Functions/tests/{gtest_hasAll.cpp => gtest_has_all.cpp} (100%) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_has_all.cpp similarity index 100% rename from src/Functions/tests/gtest_hasAll.cpp rename to src/Functions/tests/gtest_has_all.cpp From 8d0a9689e4cad21dd03e459a74b9a0a564b0db60 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:40:18 +0200 Subject: [PATCH 24/82] Update gatherutils CMakeLists to use X86_INTRINSICS_FLAGS from cpu_features --- src/Functions/GatherUtils/CMakeLists.txt | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index 10909b99b82..460b02326a1 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -1,5 +1,4 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") -option(ENABLE_AVX2 "Enable AVX2 instructions (when available) when build for modern Intel CPUs" OFF) add_headers_and_sources(clickhouse_functions_gatherutils .) add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers}) @@ -17,11 +16,4 @@ if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() -if (HAVE_SSE42) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) -endif() -if (HAVE_AVX2 AND ENABLE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") - target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) -endif() +set_target_properties(clickhouse_functions_gatherutils PROPERTIES COMPILE_FLAGS "${X86_INTRINSICS_FLAGS}") From e43fdcd7ebb54ce15eb4612dba18f93beae03802 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:41:34 +0200 Subject: [PATCH 25/82] Function hasAll added dynamic dispatch for SSE4.2, AVX2 --- .../GatherUtils/sliceHasImplAnyAll.h | 2302 ++++++----------- 1 file changed, 848 insertions(+), 1454 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 52448f88447..97ac0c6be72 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -9,37 +9,836 @@ #include #include #endif + #if defined(__AVX2__) #include #endif -namespace DB::GatherUtils -{ -namespace +#include + +namespace DB::GatherUtils { inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) { - if (null_map != nullptr) - { - for (size_t i = 0; i < null_map_size; ++i) - { - if (null_map[i]) - return true; - } + if (null_map == nullptr) { + return false; } + + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + return false; } +template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; } -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. + +#if defined(__AVX2__) + +DECLARE_AVX2_SPECIFIC_CODE ( + +// AVX2 Int64, UInt64 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(second_data, first_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int32, UInt32 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int full = -1, none = 0; + + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(second_data, first_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 7) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16, UInt16 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int16_t full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(second_data, first_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(second_data, _mm256_permute2x128_si256(first_data, first_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +) + +#endif + +#if defined(__SSE4_2__) + +DECLARE_SSE42_SPECIFIC_CODE ( + +// SSE4.2 Int64, UInt64 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int32, UInt32 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(second_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int16, UInt16 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + + size_t i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 6) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// Int8/UInt8 version is faster with SSE than with AVX2 +// SSE2 Int8, UInt8 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt8( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int8_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +) + +#endif + template < ArraySearchType search_type, typename FirstSliceType, typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAllGenericImpl(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; @@ -81,1469 +880,64 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se if (!has && search_type == ArraySearchType::All) return false; } + return search_type == ArraySearchType::All; } - -#if (defined(__AVX2__) && defined(ENABLE_AVX2)) || defined(__SSE4_2__) - -namespace +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. +template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +inline ALWAYS_INLINE bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { - -template -inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( - size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (; j < second.size; ++j) +#if USE_MULTITARGET_CODE + if constexpr (search_type == ArraySearchType::All && std::is_same_v) { - // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null - if (has_second_null_map && second_null_map[j]) - continue; - bool found = false; - - for (size_t i = 0; i < first.size; ++i) +#if defined(__AVX2__) + if (isArchSupported(TargetArch::AVX2)) { - if (has_first_null_map && first_null_map[i]) - continue; - - if (first.data[i] == second.data[j]) + if constexpr (std::is_same_v> || std::is_same_v>) { - found = true; - break; + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map); } } - - if (!found) - return false; - } - return true; -} - -} - #endif -#if defined(__AVX2__) && defined(ENABLE_AVX2) - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) + if (isArchSupported(TargetArch::SSE42)) { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + if constexpr (std::is_same_v> || std::is_same_v>) { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); + return TargetSpecific::SSE42::sliceHasImplAnyAllImplInt8(first, second, first_null_map, second_null_map); } - - if (i < first.size) + else if constexpr (std::is_same_v> || std::is_same_v>) { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map); } } } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int32 specialization of sliceHasImplAnyAll -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - size_t i = 0; - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 7) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 UInt32 specialization of sliceHasImplAnyAll -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - size_t i = 0; - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 7) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 UInt16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#elif defined(__SSE4_2__) - -// SSE4.2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 1 && first.size > 1) - { - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 1) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 1 && first.size > 1) - { - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 1) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int32 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt32 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 6) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 6) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - #endif -// Int8 version is faster with SSE than with AVX2 -#if defined(__SSE4_2__) -// SSE2 Int8 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int8_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_loadu_si128(reinterpret_cast(first_null_map + i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); + return sliceHasImplAnyAllGenericImpl(first, second, first_null_map, second_null_map); } -// SSE2 UInt8 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int8_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_loadu_si128(reinterpret_cast(first_null_map + i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} -#endif } From 42acb1dc29bd8e6272e38a8bc33ca9577ff011d6 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 31 Mar 2022 13:26:32 +0000 Subject: [PATCH 26/82] fix inserts to columns of type Object in partitioned tables --- src/DataTypes/ObjectUtils.cpp | 16 ++++++------- src/DataTypes/ObjectUtils.h | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 24 ++++++++++++------- src/Storages/MergeTree/MergeTreeDataWriter.h | 9 +++---- src/Storages/MergeTree/MergeTreeSink.cpp | 2 ++ .../MergeTree/ReplicatedMergeTreeSink.cpp | 4 +++- src/Storages/StorageMemory.cpp | 3 +-- .../01825_type_json_partitions.reference | 2 ++ .../01825_type_json_partitions.sql | 13 ++++++++++ 9 files changed, 49 insertions(+), 26 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_partitions.reference create mode 100644 tests/queries/0_stateless/01825_type_json_partitions.sql diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 9004a5296e0..cbabc71a965 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -128,22 +128,21 @@ static auto extractVector(const std::vector & vec) return res; } -void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns) +void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns) { std::unordered_map storage_columns_map; for (const auto & [name, type] : extended_storage_columns) storage_columns_map[name] = type; - for (auto & name_type : columns_list) + for (auto & column : block) { - if (!isObject(name_type.type)) + if (!isObject(column.type)) continue; - auto & column = block.getByName(name_type.name); if (!isObject(column.type)) throw Exception(ErrorCodes::TYPE_MISMATCH, "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}", - name_type.name, name_type.type->getName(), column.type->getName()); + column.name, column.type->getName(), column.type->getName()); const auto & column_object = assert_cast(*column.column); const auto & subcolumns = column_object.getSubcolumns(); @@ -151,7 +150,7 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con if (!column_object.isFinalized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert to tuple column '{}' from type {}. Column should be finalized first", - name_type.name, name_type.type->getName()); + column.name, column.type->getName()); PathsInData tuple_paths; DataTypes tuple_types; @@ -164,12 +163,11 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); } - auto it = storage_columns_map.find(name_type.name); + auto it = storage_columns_map.find(column.name); if (it == storage_columns_map.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name); std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns); - name_type.type = column.type; /// Check that constructed Tuple type and type in storage are compatible. getLeastCommonTypeForObject({column.type, it->second}, true); diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 199a048c8cd..1dbeac2b244 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -38,7 +38,7 @@ DataTypePtr getDataTypeByColumn(const IColumn & column); /// Converts Object types and columns to Tuples in @columns_list and @block /// and checks that types are consistent with types in @extended_storage_columns. -void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns); +void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns); /// Checks that each path is not the prefix of any other path. void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 4805a273c70..fc05e293684 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -145,7 +145,7 @@ void MergeTreeDataWriter::TemporaryPart::finalize() } BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts( - const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) + const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) { BlocksWithPartition result; if (!block || !block.rows()) @@ -282,16 +282,12 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( { TemporaryPart temp_part; Block & block = block_with_partition.block; + auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); - auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot); - if (!storage_snapshot->object_columns.empty()) - { - auto extended_storage_columns = storage_snapshot->getColumns( - GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); - - convertObjectsToTuples(columns, block, extended_storage_columns); - } + for (auto & column : columns) + if (isObject(column.type)) + column.type = block.getByName(column.name).type; static const String TMP_PREFIX = "tmp_insert_"; @@ -466,6 +462,16 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( return temp_part; } +void MergeTreeDataWriter::deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) +{ + if (!storage_snapshot->object_columns.empty()) + { + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto storage_columns = storage_snapshot->getColumns(options); + convertObjectsToTuples(block, storage_columns); + } +} + MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const String & part_name, MergeTreeDataPartType part_type, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index ae46a94ccd7..33742d7e52a 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -42,14 +42,12 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - /** All rows must correspond to same partition. - * Returns part with unique name starting with 'tmp_', yet not added to MergeTreeData. - */ - MergeTreeData::MutableDataPartPtr writeTempPart(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, bool optimize_on_insert); + void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. /// You should call finalize() to wait until all data is written. + struct TemporaryPart { MergeTreeData::MutableDataPartPtr part; @@ -65,6 +63,9 @@ public: void finalize(); }; + /** All rows must correspond to same partition. + * Returns part with unique name starting with 'tmp_', yet not added to MergeTreeData. + */ TemporaryPart writeTempPart(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); /// For insertion. diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 97bbfc17e9d..7a4ecae24b3 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -50,7 +50,9 @@ struct MergeTreeSink::DelayedChunk void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 550c586f7de..63fa2071056 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -150,7 +150,8 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) if (quorum) checkQuorumPrecondition(zookeeper); - const Settings & settings = context->getSettingsRef(); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; @@ -158,6 +159,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) size_t streams = 0; bool support_parallel_write = false; + const Settings & settings = context->getSettingsRef(); for (auto & current_block : part_blocks) { diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 30be297194a..a371ac1ccf8 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -137,11 +137,10 @@ public: storage_snapshot->metadata->check(block, true); if (!storage_snapshot->object_columns.empty()) { - auto columns = storage_snapshot->metadata->getColumns().getAllPhysical().filter(block.getNames()); auto extended_storage_columns = storage_snapshot->getColumns( GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); - convertObjectsToTuples(columns, block, extended_storage_columns); + convertObjectsToTuples(block, extended_storage_columns); } if (storage.compress) diff --git a/tests/queries/0_stateless/01825_type_json_partitions.reference b/tests/queries/0_stateless/01825_type_json_partitions.reference new file mode 100644 index 00000000000..5a7ba251572 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_partitions.reference @@ -0,0 +1,2 @@ +{"id":1,"obj":{"k1":"v1","k2":""}} +{"id":2,"obj":{"k1":"","k2":"v2"}} diff --git a/tests/queries/0_stateless/01825_type_json_partitions.sql b/tests/queries/0_stateless/01825_type_json_partitions.sql new file mode 100644 index 00000000000..2cb9bca7702 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_partitions.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS t_json_partitions; + +SET allow_experimental_object_type = 1; +SET output_format_json_named_tuples_as_objects = 1; + +CREATE TABLE t_json_partitions (id UInt32, obj JSON) +ENGINE MergeTree ORDER BY id PARTITION BY id; + +INSERT INTO t_json_partitions FORMAT JSONEachRow {"id": 1, "obj": {"k1": "v1"}} {"id": 2, "obj": {"k2": "v2"}}; + +SELECT * FROM t_json_partitions ORDER BY id FORMAT JSONEachRow; + +DROP TABLE t_json_partitions; From e74d5f9d4c5e72f7c65cd9030ad566d12aa20c38 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 31 Mar 2022 18:30:19 +0200 Subject: [PATCH 27/82] Update 01825_type_json_partitions.sql --- tests/queries/0_stateless/01825_type_json_partitions.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/01825_type_json_partitions.sql b/tests/queries/0_stateless/01825_type_json_partitions.sql index 2cb9bca7702..27804e7edae 100644 --- a/tests/queries/0_stateless/01825_type_json_partitions.sql +++ b/tests/queries/0_stateless/01825_type_json_partitions.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS t_json_partitions; SET allow_experimental_object_type = 1; From 9a4cc78dfb083a20c7f8ffa9cea5baaeca958ba5 Mon Sep 17 00:00:00 2001 From: jewisliu Date: Thu, 31 Mar 2022 17:50:07 +0800 Subject: [PATCH 28/82] support ALTER TABLE t DETACH PARTITION ALL syntax --- src/Parsers/ASTPartition.h | 1 + src/Parsers/ParserPartition.cpp | 10 ++++ src/Storages/MergeTree/MergeTreeData.cpp | 9 +++- src/Storages/StorageMergeTree.cpp | 6 ++- src/Storages/StorageReplicatedMergeTree.cpp | 37 +++++++++++--- .../0_stateless/00753_alter_attach.reference | 12 +++++ .../0_stateless/00753_alter_attach.sql | 49 +++++++++++++++++++ .../0_stateless/01015_attach_part.reference | 1 + .../queries/0_stateless/01015_attach_part.sql | 4 ++ 9 files changed, 120 insertions(+), 9 deletions(-) diff --git a/src/Parsers/ASTPartition.h b/src/Parsers/ASTPartition.h index 87092f532c4..1bd16d55795 100644 --- a/src/Parsers/ASTPartition.h +++ b/src/Parsers/ASTPartition.h @@ -15,6 +15,7 @@ public: size_t fields_count = 0; String id; + bool all = false; String getID(char) const override; ASTPtr clone() const override; diff --git a/src/Parsers/ParserPartition.cpp b/src/Parsers/ParserPartition.cpp index c10999361de..5af442826df 100644 --- a/src/Parsers/ParserPartition.cpp +++ b/src/Parsers/ParserPartition.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace DB @@ -13,6 +14,7 @@ namespace DB bool ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_id("ID"); + ParserKeyword s_all("ALL"); ParserStringLiteral parser_string_literal; ParserExpression parser_expr; @@ -28,6 +30,14 @@ bool ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) partition->id = partition_id->as().value.get(); } + else if (s_all.ignore(pos, expected)) + { + ASTPtr value = makeASTFunction("tuple"); + partition->value = value; + partition->children.push_back(value); + partition->fields_count = 0; + partition->all = true; + } else { ASTPtr value; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f66586b121a..b89ca4021ae 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3373,7 +3373,12 @@ void MergeTreeData::checkAlterPartitionIsPossible( void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) { const String partition_id = getPartitionIDFromQuery(partition, getContext()); - auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + DataPartsVector parts_to_remove; + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) + parts_to_remove = getDataPartsVector(); + else + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); UInt64 partition_size = 0; @@ -3824,6 +3829,8 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc auto metadata_snapshot = getInMemoryMetadataPtr(); const Block & key_sample_block = metadata_snapshot->getPartitionKey().sample_block; + if (partition_ast.all) + return "ALL"; size_t fields_count = key_sample_block.columns(); if (partition_ast.fields_count != fields_count) throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 7f4c3deca37..521e4147968 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1348,7 +1348,11 @@ void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) + parts_to_remove = getDataPartsVector(); + else + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); /// TODO should we throw an exception if parts_to_remove is empty? removePartsFromWorkingSet(parts_to_remove, true); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d9f72cf7feb..1947a0ad427 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -4941,15 +4942,37 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de throw Exception("DROP PARTITION cannot be done on this replica because it is not a leader", ErrorCodes::NOT_A_LEADER); zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); - LogEntry entry; - String partition_id = getPartitionIDFromQuery(partition, query_context); - bool did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); - - if (did_drop) + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) { - waitForLogEntryToBeProcessedIfNecessary(entry, query_context); - cleanLastPartNode(partition_id); + Strings partitions = zookeeper->getChildren(fs::path(zookeeper_path) / "block_numbers"); + + std::vector>> entries_with_partitionid_to_drop; + entries_with_partitionid_to_drop.reserve(partitions.size()); + for (String & partition_id : partitions) + { + auto entry = std::make_unique(); + if (dropAllPartsInPartition(*zookeeper, partition_id, *entry, query_context, detach)) + entries_with_partitionid_to_drop.emplace_back(partition_id, std::move(entry)); + } + + for (const auto & entry : entries_with_partitionid_to_drop) + { + waitForLogEntryToBeProcessedIfNecessary(*entry.second, query_context); + cleanLastPartNode(entry.first); + } + } + else + { + LogEntry entry; + String partition_id = getPartitionIDFromQuery(partition, query_context); + bool did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); + if (did_drop) + { + waitForLogEntryToBeProcessedIfNecessary(entry, query_context); + cleanLastPartNode(partition_id); + } } } diff --git a/tests/queries/0_stateless/00753_alter_attach.reference b/tests/queries/0_stateless/00753_alter_attach.reference index 007b99d4748..b0d2a3d031c 100644 --- a/tests/queries/0_stateless/00753_alter_attach.reference +++ b/tests/queries/0_stateless/00753_alter_attach.reference @@ -10,3 +10,15 @@ 5 2 6 3 7 3 +4 2 +5 2 +1 1 +2 1 +3 1 +1 1 +2 1 +3 1 +1 1 +2 2 +1 1 +1 1 diff --git a/tests/queries/0_stateless/00753_alter_attach.sql b/tests/queries/0_stateless/00753_alter_attach.sql index ca43fb3aeae..2910bcc222b 100644 --- a/tests/queries/0_stateless/00753_alter_attach.sql +++ b/tests/queries/0_stateless/00753_alter_attach.sql @@ -19,4 +19,53 @@ INSERT INTO alter_attach VALUES (6, 3), (7, 3); ALTER TABLE alter_attach ATTACH PARTITION 2; SELECT * FROM alter_attach ORDER BY x; +ALTER TABLE alter_attach DETACH PARTITION ALL; +SELECT * FROM alter_attach ORDER BY x; + +ALTER TABLE alter_attach ATTACH PARTITION 2; +SELECT * FROM alter_attach ORDER BY x; + +DROP TABLE IF EXISTS detach_all_no_partition; +CREATE TABLE detach_all_no_partition (x UInt64, p UInt8) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO detach_all_no_partition VALUES (1, 1), (2, 1), (3, 1); +SELECT * FROM detach_all_no_partition ORDER BY x; + +ALTER TABLE detach_all_no_partition DETACH PARTITION ALL; +SELECT * FROM detach_all_no_partition ORDER BY x; + +ALTER TABLE detach_all_no_partition ATTACH PARTITION tuple(); +SELECT * FROM detach_all_no_partition ORDER BY x; + DROP TABLE alter_attach; +DROP TABLE detach_all_no_partition; + +DROP TABLE IF EXISTS replicated_table_detach_all1; +DROP TABLE IF EXISTS replicated_table_detach_all2; + +CREATE TABLE replicated_table_detach_all1 ( + id UInt64, + Data String +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00753_{database}/replicated_table_detach_all', '1') ORDER BY id PARTITION BY id; + +CREATE TABLE replicated_table_detach_all2 ( + id UInt64, + Data String +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00753_{database}/replicated_table_detach_all', '2') ORDER BY id PARTITION BY id; + + +INSERT INTO replicated_table_detach_all1 VALUES (1, '1'), (2, '2'); +select * from replicated_table_detach_all1 order by id; + +ALTER TABLE replicated_table_detach_all1 DETACH PARTITION ALL; +select * from replicated_table_detach_all1 order by id; +SYSTEM SYNC REPLICA replicated_table_detach_all2; +select * from replicated_table_detach_all2 order by id; + +ALTER TABLE replicated_table_detach_all1 ATTACH PARTITION tuple(1); +select * from replicated_table_detach_all1 order by id; +SYSTEM SYNC REPLICA replicated_table_detach_all2; +select * from replicated_table_detach_all2 order by id; + +DROP TABLE replicated_table_detach_all1; +DROP TABLE replicated_table_detach_all2; + diff --git a/tests/queries/0_stateless/01015_attach_part.reference b/tests/queries/0_stateless/01015_attach_part.reference index b6cd514cd25..81c49e654ac 100644 --- a/tests/queries/0_stateless/01015_attach_part.reference +++ b/tests/queries/0_stateless/01015_attach_part.reference @@ -1,3 +1,4 @@ 1000 0 1000 +0 diff --git a/tests/queries/0_stateless/01015_attach_part.sql b/tests/queries/0_stateless/01015_attach_part.sql index 6b786bfbab9..a2f949d3499 100644 --- a/tests/queries/0_stateless/01015_attach_part.sql +++ b/tests/queries/0_stateless/01015_attach_part.sql @@ -21,4 +21,8 @@ ALTER TABLE table_01 ATTACH PART '20191001_1_1_0'; SELECT COUNT() FROM table_01; +ALTER TABLE table_01 DETACH PARTITION ALL; + +SELECT COUNT() FROM table_01; + DROP TABLE IF EXISTS table_01; From 195b4c47ea8d45e1ddaf993b819d5a17eca69064 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 31 Mar 2022 20:40:33 +0300 Subject: [PATCH 29/82] Avoid processing per-column TTL multiple times Before this patch ttl.txt will not be written for per-column TTLs, and hence it will be calculated every time after server restart of DETACH/ATTACH cycle (note, that it will work w/o restart since in-memory representation will avoid this). v2: convert test to .sh to get correct current database over default for MV v3: extract UUID to avoid error like in [1]: [ 490 ] {} void DB::SystemLog::flushImpl(const std::vector &, uint64_t) []: Code: 349. DB::Exception: Cannot convert NULL value to non-Nullable type: While processing query_id LIKE concat('%', CAST(_CAST(NULL, 'Nullable(UUID)') AS uuid, 'String'), '%'): while pushing to view test_0hc2ro.this_text_log (c64e5af4-059e-4330-a728-354ecf83c031). (CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN) [1]: https://s3.amazonaws.com/clickhouse-test-reports/35820/a512d322b024d37d2f1082c4833f59f86057555f/stateless_tests_flaky_check__address__actions_.html v4: add no-parallel to avoid issues with disappeared underlying table while pushing to text_log Signed-off-by: Azat Khuzhin --- .../MergeTree/MergeTreeDataPartTTLInfo.h | 2 +- .../0_stateless/02262_column_ttl.reference | 1 + tests/queries/0_stateless/02262_column_ttl.sh | 51 +++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02262_column_ttl.reference create mode 100755 tests/queries/0_stateless/02262_column_ttl.sh diff --git a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h index 2b79ad1aac5..c60a7eec09a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h +++ b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h @@ -81,7 +81,7 @@ struct MergeTreeDataPartTTLInfos bool empty() const { /// part_min_ttl in minimum of rows, rows_where and group_by TTLs - return !part_min_ttl && moves_ttl.empty() && recompression_ttl.empty(); + return !part_min_ttl && moves_ttl.empty() && recompression_ttl.empty() && columns_ttl.empty(); } }; diff --git a/tests/queries/0_stateless/02262_column_ttl.reference b/tests/queries/0_stateless/02262_column_ttl.reference new file mode 100644 index 00000000000..f59cb48c5f5 --- /dev/null +++ b/tests/queries/0_stateless/02262_column_ttl.reference @@ -0,0 +1 @@ +1 0 diff --git a/tests/queries/0_stateless/02262_column_ttl.sh b/tests/queries/0_stateless/02262_column_ttl.sh new file mode 100755 index 00000000000..affb0c802ff --- /dev/null +++ b/tests/queries/0_stateless/02262_column_ttl.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: no-parallel +# ^^^^^^^^^^^ +# Since the underlying view may disappears while flushing log, and leads to: +# +# DB::Exception: Table test_x449vo..inner_id.9c14fb82-e6b1-4d1a-85a6-935c3a2a2029 is dropped. (TABLE_IS_DROPPED) +# + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# regression test for columns TTLs +# note, that this should be written in .sh since we need $CLICKHOUSE_DATABASE +# not 'default' to catch text_log + +$CLICKHOUSE_CLIENT -nm -q " + drop table if exists ttl_02262; + drop table if exists this_text_log; + + create table ttl_02262 (date Date, key Int, value String TTL date + interval 1 month) engine=MergeTree order by key; + insert into ttl_02262 values ('2010-01-01', 2010, 'foo'); + optimize table ttl_02262 final; + + detach table ttl_02262; + attach table ttl_02262; + + -- create system.text_log + system flush logs; +" + +ttl_02262_uuid=$($CLICKHOUSE_CLIENT -q "select uuid from system.tables where database = '$CLICKHOUSE_DATABASE' and name = 'ttl_02262'") + +$CLICKHOUSE_CLIENT -nm -q " + -- OPTIMIZE TABLE x FINAL will be done in background + -- attach to it's log, via table UUID in query_id (see merger/mutator code). + create materialized view this_text_log engine=Memory() as + select * from system.text_log where query_id like '%${ttl_02262_uuid}%'; + + optimize table ttl_02262 final; + system flush logs; + -- If TTL will be applied again (during OPTIMIZE TABLE FINAL) it will produce the following message: + -- + -- Some TTL values were not calculated for part 201701_487_641_3. Will calculate them forcefully during merge. + -- + -- Let's ensure that this is not happen anymore: + select count()>0, countIf(message LIKE '%TTL%') from this_text_log; + + drop table ttl_02262; + drop table this_text_log; +" From 7b35920d4cbc368759e9185f7b5a980bfcc22403 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 3 Apr 2022 14:03:34 +0200 Subject: [PATCH 30/82] Make more alters of nested types metadata-only --- src/Storages/AlterCommands.cpp | 33 ++++++++++++------- .../02251_alter_enum_nested_struct.reference | 7 ++++ .../02251_alter_enum_nested_struct.sql | 27 +++++++++++++++ 3 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/02251_alter_enum_nested_struct.reference create mode 100644 tests/queries/0_stateless/02251_alter_enum_nested_struct.sql diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 44f208adacc..16e1f044fd9 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -696,20 +696,22 @@ namespace /// The function works for Arrays and Nullables of the same structure. bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) { - if (from->equals(*to)) - return true; - - if (const auto * from_enum8 = typeid_cast(from)) + auto is_compatible_enum_types_conversion = [](const IDataType * from_type, const IDataType * to_type) { - if (const auto * to_enum8 = typeid_cast(to)) - return to_enum8->contains(*from_enum8); - } + if (const auto * from_enum8 = typeid_cast(from_type)) + { + if (const auto * to_enum8 = typeid_cast(to_type)) + return to_enum8->contains(*from_enum8); + } - if (const auto * from_enum16 = typeid_cast(from)) - { - if (const auto * to_enum16 = typeid_cast(to)) - return to_enum16->contains(*from_enum16); - } + if (const auto * from_enum16 = typeid_cast(from_type)) + { + if (const auto * to_enum16 = typeid_cast(to_type)) + return to_enum16->contains(*from_enum16); + } + + return false; + }; static const std::unordered_multimap ALLOWED_CONVERSIONS = { @@ -721,11 +723,18 @@ bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) { typeid(DataTypeUInt16), typeid(DataTypeDate) }, }; + /// Unwrap some nested and check for valid conevrsions while (true) { + /// types are equal, obviously pure metadata alter if (from->equals(*to)) return true; + /// We just adding something to enum, nothing changed on disk + if (is_compatible_enum_types_conversion(from, to)) + return true; + + /// Types changed, but representation on disk didn't auto it_range = ALLOWED_CONVERSIONS.equal_range(typeid(*from)); for (auto it = it_range.first; it != it_range.second; ++it) { diff --git a/tests/queries/0_stateless/02251_alter_enum_nested_struct.reference b/tests/queries/0_stateless/02251_alter_enum_nested_struct.reference new file mode 100644 index 00000000000..ada5f47c230 --- /dev/null +++ b/tests/queries/0_stateless/02251_alter_enum_nested_struct.reference @@ -0,0 +1,7 @@ +1 ['Option2','Option1'] +2 ['Option1'] +3 ['Option1','Option3'] +1 ['Option2','Option1'] +2 ['Option1'] +3 ['Option1','Option3'] +0 diff --git a/tests/queries/0_stateless/02251_alter_enum_nested_struct.sql b/tests/queries/0_stateless/02251_alter_enum_nested_struct.sql new file mode 100644 index 00000000000..ad2dab3631f --- /dev/null +++ b/tests/queries/0_stateless/02251_alter_enum_nested_struct.sql @@ -0,0 +1,27 @@ +DROP TABLE IF EXISTS alter_enum_array; + +CREATE TABLE alter_enum_array( + Key UInt64, + Value Array(Enum8('Option1'=1, 'Option2'=2)) +) +ENGINE=MergeTree() +ORDER BY tuple(); + +INSERT INTO alter_enum_array VALUES (1, ['Option2', 'Option1']), (2, ['Option1']); + +ALTER TABLE alter_enum_array MODIFY COLUMN Value Array(Enum8('Option1'=1, 'Option2'=2, 'Option3'=3)) SETTINGS mutations_sync=2; + +INSERT INTO alter_enum_array VALUES (3, ['Option1','Option3']); + +SELECT * FROM alter_enum_array ORDER BY Key; + +DETACH TABLE alter_enum_array; +ATTACH TABLE alter_enum_array; + +SELECT * FROM alter_enum_array ORDER BY Key; + +OPTIMIZE TABLE alter_enum_array FINAL; + +SELECT COUNT() FROM system.mutations where table='alter_enum_array' and database=currentDatabase(); + +DROP TABLE IF EXISTS alter_enum_array; From a8c1ccdb20ef6a772dfaa6f4b44284d3459dc722 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 3 Apr 2022 18:29:24 +0200 Subject: [PATCH 31/82] Forbit to reset non existing settings --- src/Storages/MergeTree/MergeTreeData.cpp | 8 ++++++++ .../02252_reset_non_existing_setting.reference | 0 .../02252_reset_non_existing_setting.sql | 13 +++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 tests/queries/0_stateless/02252_reset_non_existing_setting.reference create mode 100644 tests/queries/0_stateless/02252_reset_non_existing_setting.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b433c6e4591..f1af92e7763 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2099,6 +2099,14 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context dropped_columns.emplace(command.column_name); } + else if (command.type == AlterCommand::RESET_SETTING) + { + for (const auto & reset_setting : command.settings_resets) + { + if (!settings.has(reset_setting)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot reset setting '{}' because it doesn't exist for MergeTree engines family", reset_setting); + } + } else if (command.isRequireMutationStage(getInMemoryMetadata())) { /// This alter will override data on disk. Let's check that it doesn't diff --git a/tests/queries/0_stateless/02252_reset_non_existing_setting.reference b/tests/queries/0_stateless/02252_reset_non_existing_setting.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02252_reset_non_existing_setting.sql b/tests/queries/0_stateless/02252_reset_non_existing_setting.sql new file mode 100644 index 00000000000..362388c4a10 --- /dev/null +++ b/tests/queries/0_stateless/02252_reset_non_existing_setting.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS most_ordinary_mt; + +CREATE TABLE most_ordinary_mt +( + Key UInt64 +) +ENGINE = MergeTree() +ORDER BY tuple(); + +ALTER TABLE most_ordinary_mt RESET SETTING ttl; --{serverError 36} +ALTER TABLE most_ordinary_mt RESET SETTING allow_remote_fs_zero_copy_replication, xxx; --{serverError 36} + +DROP TABLE IF EXISTS most_ordinary_mt; From 4cd159746e8ae498978cbfb1fb34194965a7a864 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 30 Mar 2022 18:13:01 +0300 Subject: [PATCH 32/82] Fix polling of socket with negative timeout (when poll() interrupted by EINTR) In case of EINTR the timeout will be adjusted, but this should not be done in case of negative timeout since it means infinite timeout, and in that adjustment block negative timeout will be reset to 0, which will make poll() return (since zero timeout means return immediatelly even if no fd is ready). This should also fix 02127_connection_drain flap on CI [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/32928/ddd5bebe555ce8feebcdd339e47fc45184c20dd1/stateless_tests__release__wide_parts_enabled__actions_.html Refs: https://github.com/ClickHouse/poco/pull/55 Signed-off-by: Azat Khuzhin --- contrib/poco | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/poco b/contrib/poco index 520a90e02e3..008b1646947 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 520a90e02e3e5cb90afeae1846d161dbc508a6f1 +Subproject commit 008b16469471d55b176db181756c94e3f14dd2dc From 4547ed370a4bbe20260ccdd6cd020b4c5d8ba55a Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 30 Mar 2022 20:54:33 +0800 Subject: [PATCH 33/82] add hints for column description --- src/Common/NamePrompter.h | 7 +++++++ src/Storages/ColumnsDescription.cpp | 21 ++++++++++++++++++--- src/Storages/ColumnsDescription.h | 7 +++++-- src/Storages/IndicesDescription.h | 1 - 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index a88d4bdea8e..8e301dec8b7 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -102,6 +103,12 @@ public: return prompter.getHints(name, getAllRegisteredNames()); } + String getHintsString(const String & name) const + { + const auto hints = getHints(name); + return !hints.empty() ? ", may be you meant: " + toString(hints) : ""; + } + IHints() = default; IHints(const IHints &) = default; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 69ca6002c22..a694405665b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -230,8 +230,8 @@ void ColumnsDescription::remove(const String & column_name) { auto range = getNameRange(columns, column_name); if (range.first == range.second) - throw Exception("There is no column " + column_name + " in table.", - ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + throw Exception( + "There is no column " + column_name + " in table" + getHintsString(column_name), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); for (auto list_it = range.first; list_it != range.second;) { @@ -244,7 +244,10 @@ void ColumnsDescription::rename(const String & column_from, const String & colum { auto it = columns.get<1>().find(column_from); if (it == columns.get<1>().end()) - throw Exception("Cannot find column " + column_from + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + { + throw Exception( + "Cannot find column " + column_from + " in ColumnsDescription" + getHintsString(column_from), ErrorCodes::LOGICAL_ERROR); + } columns.get<1>().modify_key(it, [&column_to] (String & old_name) { @@ -745,6 +748,18 @@ void ColumnsDescription::removeSubcolumns(const String & name_in_storage) subcolumns.get<1>().erase(range.first, range.second); } +std::vector ColumnsDescription::getAllRegisteredNames() const +{ + std::vector names; + names.reserve(columns.size()); + for (const auto & column : columns) + { + if (column.name.find('.') == std::string::npos) + names.push_back(column.name); + } + return names; +} + Block validateColumnsDefaultsAndGetSampleBlock(ASTPtr default_expr_list, const NamesAndTypesList & all_columns, ContextPtr context) { for (const auto & child : default_expr_list->children) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 4ae1dcfc2cd..affe2ef5a56 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -91,7 +91,7 @@ struct ColumnDescription /// Description of multiple table columns (in CREATE TABLE for example). -class ColumnsDescription +class ColumnsDescription : public IHints<2, ColumnsDescription> { public: ColumnsDescription() = default; @@ -149,7 +149,8 @@ public: { auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) - throw Exception("Cannot find column " + column_name + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + throw Exception( + "Cannot find column " + column_name + " in ColumnsDescription" + getHintsString(column_name), ErrorCodes::LOGICAL_ERROR); removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) @@ -196,6 +197,8 @@ public: return columns.empty(); } + std::vector getAllRegisteredNames() const override; + /// Keep the sequence of columns and allow to lookup by name. using ColumnsContainer = boost::multi_index_container< ColumnDescription, diff --git a/src/Storages/IndicesDescription.h b/src/Storages/IndicesDescription.h index 72e0748778f..862df6fe23c 100644 --- a/src/Storages/IndicesDescription.h +++ b/src/Storages/IndicesDescription.h @@ -74,7 +74,6 @@ struct IndicesDescription : public std::vector, IHints<1, Indi /// Return common expression for all stored indices ExpressionActionsPtr getSingleExpressionForIndices(const ColumnsDescription & columns, ContextPtr context) const; -public: Names getAllRegisteredNames() const override; }; From fd9a10ef5300ac4ad20eca03b7f213ba5b571e98 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 30 Mar 2022 21:33:23 +0800 Subject: [PATCH 34/82] add hints for projections --- src/Storages/ColumnsDescription.h | 2 +- src/Storages/ProjectionsDescription.cpp | 17 +++++++++++++++-- src/Storages/ProjectionsDescription.h | 4 +++- .../02250_hints_for_columns.reference | 3 +++ .../0_stateless/02250_hints_for_columns.sh | 17 +++++++++++++++++ .../02250_hints_for_projections.reference | 1 + .../0_stateless/02250_hints_for_projections.sh | 13 +++++++++++++ 7 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02250_hints_for_columns.reference create mode 100644 tests/queries/0_stateless/02250_hints_for_columns.sh create mode 100644 tests/queries/0_stateless/02250_hints_for_projections.reference create mode 100644 tests/queries/0_stateless/02250_hints_for_projections.sh diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index affe2ef5a56..81cb475a1f6 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -91,7 +91,7 @@ struct ColumnDescription /// Description of multiple table columns (in CREATE TABLE for example). -class ColumnsDescription : public IHints<2, ColumnsDescription> +class ColumnsDescription : public IHints<1, ColumnsDescription> { public: ColumnsDescription() = default; diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index 7c340cda739..70e312931cc 100644 --- a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -335,7 +335,9 @@ const ProjectionDescription & ProjectionsDescription::get(const String & project { auto it = map.find(projection_name); if (it == map.end()) - throw Exception("There is no projection " + projection_name + " in table", ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + throw Exception( + "There is no projection " + projection_name + " in table" + getHintsString(projection_name), + ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); return *(it->second); } @@ -376,13 +378,24 @@ void ProjectionsDescription::remove(const String & projection_name, bool if_exis { if (if_exists) return; - throw Exception("There is no projection " + projection_name + " in table.", ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + throw Exception( + "There is no projection " + projection_name + " in table" + getHintsString(projection_name), + ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); } projections.erase(it->second); map.erase(it); } +std::vector ProjectionsDescription::getAllRegisteredNames() const +{ + std::vector names; + names.reserve(map.size()); + for (const auto & pair : map) + names.push_back(pair.first); + return names; +} + ExpressionActionsPtr ProjectionsDescription::getSingleExpressionForProjections(const ColumnsDescription & columns, ContextPtr query_context) const { diff --git a/src/Storages/ProjectionsDescription.h b/src/Storages/ProjectionsDescription.h index 3e8d5e1a4f1..c48942eb0ec 100644 --- a/src/Storages/ProjectionsDescription.h +++ b/src/Storages/ProjectionsDescription.h @@ -106,7 +106,7 @@ struct ProjectionDescription using ProjectionDescriptionRawPtr = const ProjectionDescription *; /// All projections in storage -struct ProjectionsDescription +struct ProjectionsDescription : public IHints<1, ProjectionsDescription> { ProjectionsDescription() = default; ProjectionsDescription(ProjectionsDescription && other) = default; @@ -138,6 +138,8 @@ struct ProjectionsDescription add(ProjectionDescription && projection, const String & after_projection = String(), bool first = false, bool if_not_exists = false); void remove(const String & projection_name, bool if_exists); + std::vector getAllRegisteredNames() const override; + private: /// Keep the sequence of columns and allow to lookup by name. using Container = std::list; diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference new file mode 100644 index 00000000000..0eabe367130 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -0,0 +1,3 @@ +OK +OK +OK diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh new file mode 100644 index 00000000000..e8fe1a7a160 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_columns.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" + +$CLICKHOUSE_CLIENT --query="CREATE TABLE t (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh new file mode 100644 index 00000000000..57123b88bde --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_projections.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" + +$CLICKHOUSE_CLIENT --query="create table t (x Int32, y Int32, projection pToDrop (select x, y order by x)) engine = MergeTree order by y;" + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep -q "may be you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file From eda299b48b744d321d013885ecddcff0eb08fc0d Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 12:14:28 +0800 Subject: [PATCH 35/82] fix building --- src/Common/NamePrompter.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 8e301dec8b7..9663427ef12 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include #include #include @@ -105,8 +105,12 @@ public: String getHintsString(const String & name) const { - const auto hints = getHints(name); - return !hints.empty() ? ", may be you meant: " + toString(hints) : ""; + auto hints = getHints(name); + + /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp + for (auto & hint : hints) + hint = "'" + hint + "'"; + return !hints.empty() ? ", may be you meant: " + boost::algorithm::join(hints, ",") : ""; } IHints() = default; From d6247338de5ba64f4180ef99dd7e003416ee04d7 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 13:33:20 +0800 Subject: [PATCH 36/82] fix failed stateless tests --- src/Common/NamePrompter.h | 2 +- src/Storages/AlterCommands.cpp | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 9663427ef12..25206cbd25f 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -110,7 +110,7 @@ public: /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp for (auto & hint : hints) hint = "'" + hint + "'"; - return !hints.empty() ? ", may be you meant: " + boost::algorithm::join(hints, ",") : ""; + return !hints.empty() ? ", may be you meant: [" + boost::algorithm::join(hints, ",") + "]" : ""; } IHints() = default; diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 44f208adacc..3ddeec4fa47 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,8 +1046,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + + all_columns.getHintsString(column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else continue; } @@ -1153,7 +1155,8 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt } else if (!command.if_exists) throw Exception( - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop", + "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop" + + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); } else if (command.type == AlterCommand::COMMENT_COLUMN) @@ -1161,8 +1164,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + + all_columns.getHintsString(command.column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; } } else if (command.type == AlterCommand::MODIFY_SETTING || command.type == AlterCommand::RESET_SETTING) @@ -1196,8 +1201,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" + + all_columns.getHintsString(command.column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else continue; } From 6bc1786047e0adee4c3d5c121fa9d9b3b0626c1a Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 16:43:23 +0800 Subject: [PATCH 37/82] fix style --- src/Storages/AlterCommands.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 3ddeec4fa47..5b44a4676c6 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,8 +1046,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + all_columns.getHintsString(column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else @@ -1164,8 +1163,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; } @@ -1201,8 +1199,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" + throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else From 9dd1a76fd85e3ed677d084a01c1a550255e19c9e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 16:45:36 +0800 Subject: [PATCH 38/82] fix stateless tests --- tests/queries/0_stateless/02250_hints_for_columns.reference | 2 -- tests/queries/0_stateless/02250_hints_for_projections.reference | 1 - 2 files changed, 3 deletions(-) diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference index 0eabe367130..d86bac9de59 100644 --- a/tests/queries/0_stateless/02250_hints_for_columns.reference +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -1,3 +1 @@ OK -OK -OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference index d86bac9de59..e69de29bb2d 100644 --- a/tests/queries/0_stateless/02250_hints_for_projections.reference +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -1 +0,0 @@ -OK From 10bbb965127f6d3f1ae15ff2a6b0cfbbdee68a18 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 1 Apr 2022 11:56:43 +0800 Subject: [PATCH 39/82] fix stateless test --- tests/queries/0_stateless/02250_hints_for_columns.reference | 2 ++ tests/queries/0_stateless/02250_hints_for_projections.reference | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference index d86bac9de59..0eabe367130 100644 --- a/tests/queries/0_stateless/02250_hints_for_columns.reference +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -1 +1,3 @@ OK +OK +OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference index e69de29bb2d..d86bac9de59 100644 --- a/tests/queries/0_stateless/02250_hints_for_projections.reference +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -0,0 +1 @@ +OK From f4772d3b8fe416355324cac849ab925ae6bdfbe3 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 1 Apr 2022 14:45:20 +0800 Subject: [PATCH 40/82] chmod a+x 02250_hints_for_columns/02250_hints_for_projections --- tests/queries/0_stateless/02250_hints_for_columns.sh | 0 tests/queries/0_stateless/02250_hints_for_projections.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02250_hints_for_columns.sh mode change 100644 => 100755 tests/queries/0_stateless/02250_hints_for_projections.sh diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh old mode 100644 new mode 100755 diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh old mode 100644 new mode 100755 From d96b682a5562132186da9f3aaea9af2647877b5b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 1 Apr 2022 13:12:54 +0000 Subject: [PATCH 41/82] Refactor --- src/Common/NamePrompter.cpp | 15 +++++++++ src/Common/NamePrompter.h | 15 +++++---- src/Storages/AlterCommands.cpp | 32 ++++++++++++------- src/Storages/ColumnsDescription.cpp | 12 ++++--- src/Storages/ColumnsDescription.h | 7 ++-- src/Storages/ProjectionsDescription.cpp | 15 +++++---- .../0_stateless/02250_hints_for_columns.sh | 8 ++--- .../02250_hints_for_projections.sh | 4 +-- 8 files changed, 71 insertions(+), 37 deletions(-) create mode 100644 src/Common/NamePrompter.cpp diff --git a/src/Common/NamePrompter.cpp b/src/Common/NamePrompter.cpp new file mode 100644 index 00000000000..c5a2224dcb4 --- /dev/null +++ b/src/Common/NamePrompter.cpp @@ -0,0 +1,15 @@ +#include +#include + +namespace DB::detail +{ +void appendHintsMessageImpl(String & message, const std::vector & hints) +{ + if (hints.empty()) + { + return; + } + + message += ". Maybe you meant: " + toString(hints); +} +} diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 25206cbd25f..b3eb271c0f0 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include @@ -91,6 +90,10 @@ private: } }; +namespace detail +{ +void appendHintsMessageImpl(String & message, const std::vector & hints); +} template class IHints @@ -103,14 +106,10 @@ public: return prompter.getHints(name, getAllRegisteredNames()); } - String getHintsString(const String & name) const + void appendHintsMessage(String & message, const String & name) const { auto hints = getHints(name); - - /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp - for (auto & hint : hints) - hint = "'" + hint + "'"; - return !hints.empty() ? ", may be you meant: [" + boost::algorithm::join(hints, ",") + "]" : ""; + detail::appendHintsMessageImpl(message, hints); } IHints() = default; @@ -126,4 +125,6 @@ private: NamePrompter prompter; }; +void appendHintsString(String & message, const std::vector & hints, const String & name); + } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 5b44a4676c6..2870dc42af7 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,9 +1046,12 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" - + all_columns.getHintsString(column_name), + { + String exception_message = fmt::format("Wrong column. Cannot find colum {} to modify", backQuote(column_name)); + all_columns.appendHintsMessage(exception_message, column_name); + throw Exception{exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + } else continue; } @@ -1153,19 +1156,22 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt all_columns.remove(command.column_name); } else if (!command.if_exists) - throw Exception( - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to drop", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } } else if (command.type == AlterCommand::COMMENT_COLUMN) { if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to comment", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } } } else if (command.type == AlterCommand::MODIFY_SETTING || command.type == AlterCommand::RESET_SETTING) @@ -1199,9 +1205,11 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to rename", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } else continue; } diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index a694405665b..f3a939614c1 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -230,8 +230,11 @@ void ColumnsDescription::remove(const String & column_name) { auto range = getNameRange(columns, column_name); if (range.first == range.second) - throw Exception( - "There is no column " + column_name + " in table" + getHintsString(column_name), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + { + String exception_message = fmt::format("There is no column {} in table", column_name); + appendHintsMessage(exception_message, column_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + } for (auto list_it = range.first; list_it != range.second;) { @@ -245,8 +248,9 @@ void ColumnsDescription::rename(const String & column_from, const String & colum auto it = columns.get<1>().find(column_from); if (it == columns.get<1>().end()) { - throw Exception( - "Cannot find column " + column_from + " in ColumnsDescription" + getHintsString(column_from), ErrorCodes::LOGICAL_ERROR); + String exception_message = fmt::format("Cannot find column {} in ColumnsDescription", column_from); + appendHintsMessage(exception_message, column_from); + throw Exception(exception_message, ErrorCodes::LOGICAL_ERROR); } columns.get<1>().modify_key(it, [&column_to] (String & old_name) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 81cb475a1f6..d3d6f7f2ff5 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -149,8 +149,11 @@ public: { auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) - throw Exception( - "Cannot find column " + column_name + " in ColumnsDescription" + getHintsString(column_name), ErrorCodes::LOGICAL_ERROR); + { + String exception_message = fmt::format("Cannot find column {} in ColumnsDescription", column_name); + appendHintsMessage(exception_message, column_name); + throw Exception(exception_message, ErrorCodes::LOGICAL_ERROR); + } removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index 70e312931cc..69d7c5f8ed6 100644 --- a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -335,9 +335,11 @@ const ProjectionDescription & ProjectionsDescription::get(const String & project { auto it = map.find(projection_name); if (it == map.end()) - throw Exception( - "There is no projection " + projection_name + " in table" + getHintsString(projection_name), - ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + { + String exception_message = fmt::format("There is no projection {} in table", projection_name); + appendHintsMessage(exception_message, projection_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + } return *(it->second); } @@ -378,9 +380,10 @@ void ProjectionsDescription::remove(const String & projection_name, bool if_exis { if (if_exists) return; - throw Exception( - "There is no projection " + projection_name + " in table" + getHintsString(projection_name), - ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + + String exception_message = fmt::format("There is no projection {} in table", projection_name); + appendHintsMessage(exception_message, projection_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); } projections.erase(it->second); diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh index e8fe1a7a160..45fd2f238b1 100755 --- a/tests/queries/0_stateless/02250_hints_for_columns.sh +++ b/tests/queries/0_stateless/02250_hints_for_columns.sh @@ -8,10 +8,10 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" $CLICKHOUSE_CLIENT --query="CREATE TABLE t (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" -$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file +$CLICKHOUSE_CLIENT --query="DROP TABLE t" diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh index 57123b88bde..7db8b243ae4 100755 --- a/tests/queries/0_stateless/02250_hints_for_projections.sh +++ b/tests/queries/0_stateless/02250_hints_for_projections.sh @@ -8,6 +8,6 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" $CLICKHOUSE_CLIENT --query="create table t (x Int32, y Int32, projection pToDrop (select x, y order by x)) engine = MergeTree order by y;" -$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep -q "may be you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep -q "Maybe you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file +$CLICKHOUSE_CLIENT --query="DROP TABLE t" From a926bc19eabc3d739b5ed9bef0c324ac6d49ca62 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 4 Apr 2022 07:24:10 +0000 Subject: [PATCH 42/82] Address PR comments --- src/Common/NamePrompter.h | 3 --- src/Storages/AlterCommands.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index b3eb271c0f0..962a89a8e76 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -124,7 +124,4 @@ public: private: NamePrompter prompter; }; - -void appendHintsString(String & message, const std::vector & hints, const String & name); - } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 2870dc42af7..76df6316fed 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1047,7 +1047,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt { if (!command.if_exists) { - String exception_message = fmt::format("Wrong column. Cannot find colum {} to modify", backQuote(column_name)); + String exception_message = fmt::format("Wrong column. Cannot find column {} to modify", backQuote(column_name)); all_columns.appendHintsMessage(exception_message, column_name); throw Exception{exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; From 9a76efb8500e779f4925c4830e4f15d527084b7c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 3 Apr 2022 23:39:59 +0300 Subject: [PATCH 43/82] Fix formatting of INSERT INFILE queries (missing quotes) Signed-off-by: Azat Khuzhin --- src/Parsers/ASTInsertQuery.cpp | 12 ++++++++++-- .../0_stateless/02165_insert_from_infile.reference | 4 ++-- .../0_stateless/02264_format_insert_infile.reference | 3 +++ .../0_stateless/02264_format_insert_infile.sql | 2 ++ 4 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02264_format_insert_infile.reference create mode 100644 tests/queries/0_stateless/02264_format_insert_infile.sql diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index 7e1d48d7f55..1d30c8f1bbd 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -81,9 +81,17 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s if (infile) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM INFILE " << (settings.hilite ? hilite_none : "") << infile->as().value.safeGet(); + settings.ostr + << (settings.hilite ? hilite_keyword : "") + << " FROM INFILE " + << (settings.hilite ? hilite_none : "") + << quoteString(infile->as().value.safeGet()); if (compression) - settings.ostr << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? hilite_none : "") << compression->as().value.safeGet(); + settings.ostr + << (settings.hilite ? hilite_keyword : "") + << " COMPRESSION " + << (settings.hilite ? hilite_none : "") + << compression->as().value.safeGet(); } if (select) diff --git a/tests/queries/0_stateless/02165_insert_from_infile.reference b/tests/queries/0_stateless/02165_insert_from_infile.reference index 2a00a8faa31..f8c205ecc0f 100644 --- a/tests/queries/0_stateless/02165_insert_from_infile.reference +++ b/tests/queries/0_stateless/02165_insert_from_infile.reference @@ -1,5 +1,5 @@ -INSERT INTO test FROM INFILE data.file SELECT x +INSERT INTO test FROM INFILE \'data.file\' SELECT x FROM input(\'x UInt32\') -INSERT INTO test FROM INFILE data.file WITH number AS x +INSERT INTO test FROM INFILE \'data.file\' WITH number AS x SELECT number FROM input(\'number UInt32\') diff --git a/tests/queries/0_stateless/02264_format_insert_infile.reference b/tests/queries/0_stateless/02264_format_insert_infile.reference new file mode 100644 index 00000000000..338ea6fbfc6 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_infile.reference @@ -0,0 +1,3 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null'; +INSERT INTO foo FROM INFILE \'/dev/null\' diff --git a/tests/queries/0_stateless/02264_format_insert_infile.sql b/tests/queries/0_stateless/02264_format_insert_infile.sql new file mode 100644 index 00000000000..38ee39d932d --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_infile.sql @@ -0,0 +1,2 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null'; From 93bbe9641aa162c65ea182279cfa10ab71e6d8cf Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 4 Apr 2022 10:30:31 +0300 Subject: [PATCH 44/82] Fix formatting of INSERT ... COMPRESSION Signed-off-by: Azat Khuzhin --- src/Parsers/ASTInsertQuery.cpp | 2 +- .../0_stateless/02264_format_insert_compression.reference | 3 +++ tests/queries/0_stateless/02264_format_insert_compression.sql | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02264_format_insert_compression.reference create mode 100644 tests/queries/0_stateless/02264_format_insert_compression.sql diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index 1d30c8f1bbd..40e14c918ff 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -91,7 +91,7 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? hilite_none : "") - << compression->as().value.safeGet(); + << quoteString(compression->as().value.safeGet()); } if (select) diff --git a/tests/queries/0_stateless/02264_format_insert_compression.reference b/tests/queries/0_stateless/02264_format_insert_compression.reference new file mode 100644 index 00000000000..107b7fcb3e9 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_compression.reference @@ -0,0 +1,3 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null' COMPRESSION 'gz'; +INSERT INTO foo FROM INFILE \'/dev/null\' COMPRESSION \'gz\' diff --git a/tests/queries/0_stateless/02264_format_insert_compression.sql b/tests/queries/0_stateless/02264_format_insert_compression.sql new file mode 100644 index 00000000000..c095a8fbbb7 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_compression.sql @@ -0,0 +1,2 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null' COMPRESSION 'gz'; From ae53aae1063b09beaa23b74abec5fe8b79565597 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 4 Apr 2022 08:48:31 +0000 Subject: [PATCH 45/82] fix clang-tidy --- src/Storages/MergeTree/MergeTreeDataWriter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 33742d7e52a..7b6bf8fb1db 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -42,7 +42,7 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); + static void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. From 803a1a2a9c299fb2a87608f02b98e47515f66ac6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 11:19:02 +0200 Subject: [PATCH 46/82] Fix tests and check --- src/Storages/MergeTree/MergeTreeData.cpp | 3 ++- tests/queries/0_stateless/00980_merge_alter_settings.sql | 6 +++--- .../00980_zookeeper_merge_tree_alter_settings.sql | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f1af92e7763..4e4e555fb54 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1909,6 +1909,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context StorageInMemoryMetadata old_metadata = getInMemoryMetadata(); const auto & settings = local_context->getSettingsRef(); + const auto & settings_from_storage = getSettings(); if (!settings.allow_non_metadata_alters) { @@ -2103,7 +2104,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context { for (const auto & reset_setting : command.settings_resets) { - if (!settings.has(reset_setting)) + if (!settings_from_storage->has(reset_setting)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot reset setting '{}' because it doesn't exist for MergeTree engines family", reset_setting); } } diff --git a/tests/queries/0_stateless/00980_merge_alter_settings.sql b/tests/queries/0_stateless/00980_merge_alter_settings.sql index c0d18f6d453..f595a09970d 100644 --- a/tests/queries/0_stateless/00980_merge_alter_settings.sql +++ b/tests/queries/0_stateless/00980_merge_alter_settings.sql @@ -91,8 +91,8 @@ SHOW CREATE TABLE table_for_reset_setting; ALTER TABLE table_for_reset_setting RESET SETTING index_granularity; -- { serverError 472 } --- ignore undefined setting -ALTER TABLE table_for_reset_setting RESET SETTING merge_with_ttl_timeout, unknown_setting; +-- don't execute alter with incorrect setting +ALTER TABLE table_for_reset_setting RESET SETTING merge_with_ttl_timeout, unknown_setting; -- { serverError 36 } ALTER TABLE table_for_reset_setting MODIFY SETTING merge_with_ttl_timeout = 300, max_concurrent_queries = 1; @@ -102,4 +102,4 @@ ALTER TABLE table_for_reset_setting RESET SETTING max_concurrent_queries, merge_ SHOW CREATE TABLE table_for_reset_setting; -DROP TABLE IF EXISTS table_for_reset_setting; \ No newline at end of file +DROP TABLE IF EXISTS table_for_reset_setting; diff --git a/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql b/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql index dfb91eb3b0a..1b291bf84d2 100644 --- a/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql +++ b/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql @@ -108,8 +108,8 @@ ATTACH TABLE replicated_table_for_reset_setting1; SHOW CREATE TABLE replicated_table_for_reset_setting1; SHOW CREATE TABLE replicated_table_for_reset_setting2; --- ignore undefined setting -ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING check_delay_period, unknown_setting; +-- don't execute alter with incorrect setting +ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING check_delay_period, unknown_setting; -- { serverError 36 } ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING merge_with_ttl_timeout; ALTER TABLE replicated_table_for_reset_setting2 RESET SETTING merge_with_ttl_timeout; From 885447342c1cb4937b4d07c8aa46c6407e7d584e Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 13:17:33 +0200 Subject: [PATCH 47/82] More logs on unsuccessful part removal --- src/Interpreters/ExternalDictionariesLoader.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 12 +++++++++++- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/ExternalDictionariesLoader.cpp b/src/Interpreters/ExternalDictionariesLoader.cpp index f615aa24a91..4dd779e3a50 100644 --- a/src/Interpreters/ExternalDictionariesLoader.cpp +++ b/src/Interpreters/ExternalDictionariesLoader.cpp @@ -136,7 +136,7 @@ std::string ExternalDictionariesLoader::resolveDictionaryNameFromDatabaseCatalog if (qualified_name->database.empty()) { - /// Ether database name is not specified and we should use current one + /// Either database name is not specified and we should use current one /// or it's an XML dictionary. bool is_xml_dictionary = has(name); if (is_xml_dictionary) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b433c6e4591..f77480dbaaf 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2953,7 +2953,8 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) { auto lock = lockParts(); - LOG_TRACE(log, "Trying to immediately remove part {}", part->getNameWithState()); + auto part_name_with_state = part->getNameWithState(); + LOG_TRACE(log, "Trying to immediately remove part {}", part_name_with_state); if (part->getState() != DataPartState::Temporary) { @@ -2964,7 +2965,16 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) part.reset(); if (!((*it)->getState() == DataPartState::Outdated && it->unique())) + { + if (!(*it)->getState() == DataPartState::Outdated) + LOG_WARNING("Cannot immediately remove part {} because it's not in Outdated state " + "usage counter {}", part_name_with_state, it->use_count()); + + if (!it->unique()) + LOG_WARNING("Cannot immediately remove part {} because someone using it right now " + "usage counter {}", part_name_with_state, it->use_count()); return; + } modifyPartState(it, DataPartState::Deleting); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d9f72cf7feb..b6f7b65dfa2 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3321,7 +3321,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n if (!broken_part_info.contains(part->info)) continue; - /// Broken part itself ether already moved to detached or does not exist. + /// Broken part itself either already moved to detached or does not exist. assert(broken_part_info != part->info); part->makeCloneInDetached("covered-by-broken", getInMemoryMetadataPtr()); } From af405d3ba62c291d7cab5b0384dd114b1ffb6bc4 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 13:34:27 +0200 Subject: [PATCH 48/82] Fixed style check --- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 97ac0c6be72..ec8daceb990 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -21,9 +21,8 @@ namespace DB::GatherUtils inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) { - if (null_map == nullptr) { + if (null_map == nullptr) return false; - } for (size_t i = 0; i < null_map_size; ++i) { From 3c472a789778db4fbc258796a53615836c6c6478 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 13:34:40 +0200 Subject: [PATCH 49/82] Simplified hasAll performance tests --- tests/performance/hasAll_simd_int16.xml | 52 ------------------------ tests/performance/hasAll_simd_int32.xml | 52 ------------------------ tests/performance/hasAll_simd_int64.xml | 52 ------------------------ tests/performance/hasAll_simd_int8.xml | 52 ------------------------ tests/performance/has_all.xml | 53 +++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 208 deletions(-) delete mode 100644 tests/performance/hasAll_simd_int16.xml delete mode 100644 tests/performance/hasAll_simd_int32.xml delete mode 100644 tests/performance/hasAll_simd_int64.xml delete mode 100644 tests/performance/hasAll_simd_int8.xml create mode 100644 tests/performance/has_all.xml diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml deleted file mode 100644 index c2ce4eec77f..00000000000 --- a/tests/performance/hasAll_simd_int16.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - - - INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(8000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml deleted file mode 100644 index 4543dea161b..00000000000 --- a/tests/performance/hasAll_simd_int32.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - - INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml deleted file mode 100644 index 07e52483bb1..00000000000 --- a/tests/performance/hasAll_simd_int64.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - - INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(2000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml deleted file mode 100644 index 5ddc84aa5bd..00000000000 --- a/tests/performance/hasAll_simd_int8.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - - INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/has_all.xml b/tests/performance/has_all.xml new file mode 100644 index 00000000000..331442cbfee --- /dev/null +++ b/tests/performance/has_all.xml @@ -0,0 +1,53 @@ + + + + array_type + + Int8 + Int16 + Int32 + Int64 + + + + + + CREATE TABLE test_table_small_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + CREATE TABLE test_table_medium_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + CREATE TABLE test_table_large_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + INSERT INTO test_table_small_{array_type} SELECT groupArraySample(5000)(rand64()) AS set, groupArraySample(500)(rand64()) AS subset FROM numbers(10000000) GROUP BY number % 5000; + INSERT INTO test_table_medium_{array_type} SELECT groupArraySample(50000)(rand64()) AS set, groupArraySample(5000)(rand64()) AS subset FROM numbers(25000000) GROUP BY number % 50000; + INSERT INTO test_table_large_{array_type} SELECT groupArraySample(500000)(rand64()) AS set, groupArraySample(500000)(rand64()) AS subset FROM numbers(50000000) GROUP BY number % 500000; + + SELECT hasAll(set, subset) FROM test_table_small_{array_type} FORMAT Null + SELECT hasAll(set, subset) FROM test_table_medium_{array_type} FORMAT Null + SELECT hasAll(set, subset) FROM test_table_large_{array_type} FORMAT Null + + DROP TABLE IF EXISTS test_table_small_{array_type} + DROP TABLE IF EXISTS test_table_medium_{array_type} + DROP TABLE IF EXISTS test_table_large_{array_type} + From 09c04e4993ff357fa10d352f394dbe6204f4ee96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E6=9D=8E=E5=A4=AB?= Date: Mon, 4 Apr 2022 19:56:41 +0800 Subject: [PATCH 50/82] Improve the pipeline description for JOIN (#35612) Improve the pipeline description for JOIN --- .../QueryPlan/ITransformingStep.cpp | 5 +++++ src/Processors/QueryPlan/ITransformingStep.h | 3 +++ src/QueryPipeline/QueryPipelineBuilder.cpp | 15 ++++++++++++++- .../02236_explain_pipeline_join.reference | 19 +++++++++++++++++++ .../02236_explain_pipeline_join.sql | 10 ++++++++++ 5 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02236_explain_pipeline_join.reference create mode 100644 tests/queries/0_stateless/02236_explain_pipeline_join.sql diff --git a/src/Processors/QueryPlan/ITransformingStep.cpp b/src/Processors/QueryPlan/ITransformingStep.cpp index 629fb89be1e..9b9797b6540 100644 --- a/src/Processors/QueryPlan/ITransformingStep.cpp +++ b/src/Processors/QueryPlan/ITransformingStep.cpp @@ -70,4 +70,9 @@ void ITransformingStep::describePipeline(FormatSettings & settings) const IQueryPlanStep::describePipeline(processors, settings); } +void ITransformingStep::appendExtraProcessors(const Processors & extra_processors) +{ + processors.insert(processors.end(), extra_processors.begin(), extra_processors.end()); +} + } diff --git a/src/Processors/QueryPlan/ITransformingStep.h b/src/Processors/QueryPlan/ITransformingStep.h index d87ca05d4bc..8f3641dd5bd 100644 --- a/src/Processors/QueryPlan/ITransformingStep.h +++ b/src/Processors/QueryPlan/ITransformingStep.h @@ -57,6 +57,9 @@ public: void describePipeline(FormatSettings & settings) const override; + /// Append extra processors for this step. + void appendExtraProcessors(const Processors & extra_processors); + protected: /// Clear distinct_columns if res_header doesn't contain all of them. static void updateDistinctColumns(const Block & res_header, NameSet & distinct_columns); diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index fcd3105a422..9f392b51cf0 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -307,7 +308,15 @@ std::unique_ptr QueryPipelineBuilder::joinPipelines( right->pipe.dropExtremes(); left->pipe.collected_processors = collected_processors; - right->pipe.collected_processors = collected_processors; + + /// Collect the NEW processors for the right pipeline. + QueryPipelineProcessorsCollector collector(*right); + /// Remember the last step of the right pipeline. + ExpressionStep* step = typeid_cast(right->pipe.processors.back()->getQueryPlanStep()); + if (!step) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "The top step of the right pipeline should be ExpressionStep"); + } /// In case joined subquery has totals, and we don't, add default chunk to totals. bool default_totals = false; @@ -377,6 +386,10 @@ std::unique_ptr QueryPipelineBuilder::joinPipelines( left->pipe.processors.emplace_back(std::move(joining)); } + /// Move the collected processors to the last step in the right pipeline. + Processors processors = collector.detachProcessors(); + step->appendExtraProcessors(processors); + left->pipe.processors.insert(left->pipe.processors.end(), right->pipe.processors.begin(), right->pipe.processors.end()); left->pipe.holder = std::move(right->pipe.holder); left->pipe.header = left->pipe.output_ports.front()->getHeader(); diff --git a/tests/queries/0_stateless/02236_explain_pipeline_join.reference b/tests/queries/0_stateless/02236_explain_pipeline_join.reference new file mode 100644 index 00000000000..ed993e2a1e7 --- /dev/null +++ b/tests/queries/0_stateless/02236_explain_pipeline_join.reference @@ -0,0 +1,19 @@ +(Expression) +ExpressionTransform + (Join) + JoiningTransform 2 → 1 + (Expression) + ExpressionTransform + (SettingQuotaAndLimits) + (Limit) + Limit + (ReadFromStorage) + Numbers 0 → 1 + (Expression) + FillingRightJoinSide + ExpressionTransform + (SettingQuotaAndLimits) + (Limit) + Limit + (ReadFromStorage) + Numbers 0 → 1 diff --git a/tests/queries/0_stateless/02236_explain_pipeline_join.sql b/tests/queries/0_stateless/02236_explain_pipeline_join.sql new file mode 100644 index 00000000000..de885ed74ee --- /dev/null +++ b/tests/queries/0_stateless/02236_explain_pipeline_join.sql @@ -0,0 +1,10 @@ +EXPLAIN PIPELINE +SELECT * FROM +( + SELECT * FROM system.numbers LIMIT 10 +) t1 +ALL LEFT JOIN +( + SELECT * FROM system.numbers LIMIT 10 +) t2 +USING number; From 9b75ef6ce957b550aaf3fdb0ae2403227a4cfceb Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 14:03:16 +0200 Subject: [PATCH 51/82] Fix build --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f77480dbaaf..8419f07ae73 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2966,7 +2966,7 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) if (!((*it)->getState() == DataPartState::Outdated && it->unique())) { - if (!(*it)->getState() == DataPartState::Outdated) + if ((*it)->getState() != DataPartState::Outdated) LOG_WARNING("Cannot immediately remove part {} because it's not in Outdated state " "usage counter {}", part_name_with_state, it->use_count()); From 47528de78ba2d317f6532dab3bb07461f469049c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 14:07:05 +0200 Subject: [PATCH 52/82] Fix build --- src/CMakeLists.txt | 12 ++++++++++++ src/{Functions => Common}/TargetSpecific.cpp | 2 +- src/{Functions => Common}/TargetSpecific.h | 0 src/Functions/CMakeLists.txt | 11 ----------- src/Functions/FunctionStartsEndsWith.h | 2 +- src/Functions/FunctionsHashing.h | 2 +- src/Functions/FunctionsRandom.h | 2 +- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 3 ++- src/Functions/PerformanceAdaptors.h | 2 +- src/Functions/greatCircleDistance.cpp | 2 +- 10 files changed, 20 insertions(+), 18 deletions(-) rename src/{Functions => Common}/TargetSpecific.cpp (96%) rename src/{Functions => Common}/TargetSpecific.h (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 145015ad0f2..851c276cd10 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,6 +49,18 @@ if (COMPILER_GCC) add_definitions ("-fno-tree-loop-distribute-patterns") endif () +# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`). +# If turned ON, this option defines such macro. +# See `src/Common/TargetSpecific.h` +option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) + +if (ENABLE_MULTITARGET_CODE) + add_definitions(-DENABLE_MULTITARGET_CODE=1) +else() + add_definitions(-DENABLE_MULTITARGET_CODE=0) +endif() + + add_subdirectory (Access) add_subdirectory (Backups) add_subdirectory (Columns) diff --git a/src/Functions/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp similarity index 96% rename from src/Functions/TargetSpecific.cpp rename to src/Common/TargetSpecific.cpp index 830611fea7a..43319eff44b 100644 --- a/src/Functions/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/src/Functions/TargetSpecific.h b/src/Common/TargetSpecific.h similarity index 100% rename from src/Functions/TargetSpecific.h rename to src/Common/TargetSpecific.h diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 2596b10503f..debe7fac8a5 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -96,17 +96,6 @@ if (TARGET ch_contrib::rapidjson) target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson) endif() -# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`). -# If turned ON, this option defines such macro. -# See `src/Functions/TargetSpecific.h` -option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) - -if (ENABLE_MULTITARGET_CODE) - add_definitions(-DENABLE_MULTITARGET_CODE=1) -else() - add_definitions(-DENABLE_MULTITARGET_CODE=0) -endif() - add_subdirectory(GatherUtils) target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils) diff --git a/src/Functions/FunctionStartsEndsWith.h b/src/Functions/FunctionStartsEndsWith.h index bbe1631fdf9..f6e0d6375c6 100644 --- a/src/Functions/FunctionStartsEndsWith.h +++ b/src/Functions/FunctionStartsEndsWith.h @@ -1,12 +1,12 @@ #pragma once #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 88a0e9524b3..b78ecb5c72a 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -38,8 +38,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/src/Functions/FunctionsRandom.h b/src/Functions/FunctionsRandom.h index 2dacd6d6db9..937bc9d36dd 100644 --- a/src/Functions/FunctionsRandom.h +++ b/src/Functions/FunctionsRandom.h @@ -1,9 +1,9 @@ #pragma once +#include #include #include #include -#include #include #include diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index ec8daceb990..68f31006b4f 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -14,7 +14,8 @@ #include #endif -#include +#include + namespace DB::GatherUtils { diff --git a/src/Functions/PerformanceAdaptors.h b/src/Functions/PerformanceAdaptors.h index bcc195e988e..5b690d83805 100644 --- a/src/Functions/PerformanceAdaptors.h +++ b/src/Functions/PerformanceAdaptors.h @@ -1,8 +1,8 @@ #pragma once -#include #include +#include #include #include diff --git a/src/Functions/greatCircleDistance.cpp b/src/Functions/greatCircleDistance.cpp index f0743486584..9b0d2625914 100644 --- a/src/Functions/greatCircleDistance.cpp +++ b/src/Functions/greatCircleDistance.cpp @@ -6,8 +6,8 @@ #include #include #include -#include #include +#include #include #include From bd89fcafdbc44b4b41f1c7458af5eeedec062774 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 4 Apr 2022 14:17:15 +0200 Subject: [PATCH 53/82] Make `SortDescription::column_name` always non-empty (#35805) --- src/Core/Block.cpp | 3 +- src/Core/SortCursor.h | 32 ++++------- src/Core/SortDescription.cpp | 35 +++--------- src/Core/SortDescription.h | 54 +++++++++++-------- src/Interpreters/InterpreterSelectQuery.cpp | 4 -- src/Interpreters/MutationsInterpreter.cpp | 2 +- src/Interpreters/Set.cpp | 4 +- src/Interpreters/sortBlock.cpp | 4 +- src/Processors/LimitTransform.cpp | 7 +-- .../Algorithms/AggregatingSortedAlgorithm.cpp | 11 ++-- .../Algorithms/CollapsingSortedAlgorithm.cpp | 8 +-- .../FinishAggregatingInOrderAlgorithm.cpp | 27 +++------- .../FinishAggregatingInOrderAlgorithm.h | 6 +-- .../GraphiteRollupSortedAlgorithm.cpp | 18 ++++--- .../IMergingAlgorithmWithDelayedChunk.cpp | 13 ++--- .../IMergingAlgorithmWithDelayedChunk.h | 6 +-- .../IMergingAlgorithmWithSharedChunks.cpp | 12 ++--- .../IMergingAlgorithmWithSharedChunks.h | 6 +-- .../Algorithms/MergingSortedAlgorithm.cpp | 20 +++---- .../Algorithms/MergingSortedAlgorithm.h | 4 +- .../Algorithms/ReplacingSortedAlgorithm.cpp | 18 ++++--- .../Algorithms/SummingSortedAlgorithm.cpp | 17 +++--- .../VersionedCollapsingAlgorithm.cpp | 13 ++--- src/Processors/QueryPlan/FillingStep.cpp | 4 +- .../QueryPlan/ReadFromMergeTree.cpp | 11 +--- src/Processors/QueryPlan/SortingStep.cpp | 12 ++--- src/Processors/QueryPlan/WindowStep.cpp | 2 +- .../AggregatingInOrderTransform.cpp | 12 +---- .../Transforms/AggregatingInOrderTransform.h | 2 +- .../Transforms/CheckSortedTransform.cpp | 30 ++--------- .../Transforms/CheckSortedTransform.h | 11 +--- .../Transforms/DistinctSortedTransform.cpp | 11 ++-- .../Transforms/DistinctSortedTransform.h | 6 ++- .../Transforms/FinishSortingTransform.cpp | 14 +++-- .../Transforms/FinishSortingTransform.h | 9 ++-- .../Transforms/MergeSortingTransform.cpp | 21 +++++--- .../Transforms/MergeSortingTransform.h | 17 +++--- .../Transforms/PartialSortingTransform.cpp | 4 +- .../Transforms/SortingTransform.cpp | 19 ++----- src/Processors/Transforms/SortingTransform.h | 8 +-- src/Storages/MergeTree/MergeTask.cpp | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 4 +- 42 files changed, 218 insertions(+), 305 deletions(-) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index a7142ef7f2e..60d2eba4f08 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -46,7 +46,8 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con return onError("Block structure mismatch in " + std::string(context_description) + " stream: different names of columns:\n" + actual.dumpStructure() + "\n" + expected.dumpStructure(), code); - if (!actual.type->equals(*expected.type)) + if ((actual.type && !expected.type) || (!actual.type && expected.type) + || (actual.type && expected.type && !actual.type->equals(*expected.type))) return onError("Block structure mismatch in " + std::string(context_description) + " stream: different types:\n" + actual.dumpStructure() + "\n" + expected.dumpStructure(), code); diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h index a5daba9fbee..a0f60fbccf8 100644 --- a/src/Core/SortCursor.h +++ b/src/Core/SortCursor.h @@ -15,10 +15,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} /** Cursor allows to compare rows in different blocks (and parts). * Cursor moves inside single block. @@ -61,25 +57,21 @@ struct SortCursorImpl reset(block, perm); } - SortCursorImpl(const Columns & columns, const SortDescription & desc_, size_t order_ = 0, IColumn::Permutation * perm = nullptr) + SortCursorImpl( + const Block & header, + const Columns & columns, + const SortDescription & desc_, + size_t order_ = 0, + IColumn::Permutation * perm = nullptr) : desc(desc_), sort_columns_size(desc.size()), order(order_), need_collation(desc.size()) { - for (auto & column_desc : desc) - { - if (!column_desc.column_name.empty()) - throw Exception("SortDescription should contain column position if SortCursor was used without header.", - ErrorCodes::LOGICAL_ERROR); - } - reset(columns, {}, perm); + reset(columns, header, perm); } bool empty() const { return rows == 0; } /// Set the cursor to the beginning of the new block. - void reset(const Block & block, IColumn::Permutation * perm = nullptr) - { - reset(block.getColumns(), block, perm); - } + void reset(const Block & block, IColumn::Permutation * perm = nullptr) { reset(block.getColumns(), block, perm); } /// Set the cursor to the beginning of the new block. void reset(const Columns & columns, const Block & block, IColumn::Permutation * perm = nullptr) @@ -95,9 +87,7 @@ struct SortCursorImpl for (size_t j = 0, size = desc.size(); j < size; ++j) { auto & column_desc = desc[j]; - size_t column_number = !column_desc.column_name.empty() - ? block.getPositionByName(column_desc.column_name) - : column_desc.column_number; + size_t column_number = block.getPositionByName(column_desc.column_name); sort_columns.push_back(columns[column_number].get()); need_collation[j] = desc[j].collator != nullptr && sort_columns.back()->isCollationSupported(); @@ -367,12 +357,12 @@ private: }; template -bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescription & descr) +bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescriptionWithPositions & descr) { for (const auto & elem : descr) { size_t ind = elem.column_number; - int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); + int res = elem.base.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.base.nulls_direction); if (res < 0) return true; else if (res > 0) diff --git a/src/Core/SortDescription.cpp b/src/Core/SortDescription.cpp index 314b6624623..7994ada7b85 100644 --- a/src/Core/SortDescription.cpp +++ b/src/Core/SortDescription.cpp @@ -1,12 +1,12 @@ -#include #include +#include #include #include namespace DB { -void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out) +void dumpSortDescription(const SortDescription & description, WriteBuffer & out) { bool first = true; @@ -16,17 +16,7 @@ void dumpSortDescription(const SortDescription & description, const Block & head out << ", "; first = false; - if (!desc.column_name.empty()) - out << desc.column_name; - else - { - if (desc.column_number < header.columns()) - out << header.getByPosition(desc.column_number).name; - else - out << "?"; - - out << " (pos " << desc.column_number << ")"; - } + out << desc.column_name; if (desc.direction > 0) out << " ASC"; @@ -38,18 +28,9 @@ void dumpSortDescription(const SortDescription & description, const Block & head } } -void SortColumnDescription::explain(JSONBuilder::JSONMap & map, const Block & header) const +void SortColumnDescription::explain(JSONBuilder::JSONMap & map) const { - if (!column_name.empty()) - map.add("Column", column_name); - else - { - if (column_number < header.columns()) - map.add("Column", header.getByPosition(column_number).name); - - map.add("Position", column_number); - } - + map.add("Column", column_name); map.add("Ascending", direction > 0); map.add("With Fill", with_fill); } @@ -57,17 +38,17 @@ void SortColumnDescription::explain(JSONBuilder::JSONMap & map, const Block & he std::string dumpSortDescription(const SortDescription & description) { WriteBufferFromOwnString wb; - dumpSortDescription(description, Block{}, wb); + dumpSortDescription(description, wb); return wb.str(); } -JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description, const Block & header) +JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description) { auto json_array = std::make_unique(); for (const auto & descr : description) { auto json_map = std::make_unique(); - descr.explain(*json_map, header); + descr.explain(*json_map); json_array->add(std::move(json_map)); } diff --git a/src/Core/SortDescription.h b/src/Core/SortDescription.h index db15f3a54db..66f2ca24c69 100644 --- a/src/Core/SortDescription.h +++ b/src/Core/SortDescription.h @@ -39,7 +39,6 @@ struct FillColumnDescription struct SortColumnDescription { std::string column_name; /// The name of the column. - size_t column_number; /// Column number (used if no name is given). int direction; /// 1 - ascending, -1 - descending. int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less. /// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite. @@ -48,23 +47,24 @@ struct SortColumnDescription FillColumnDescription fill_description; explicit SortColumnDescription( - size_t column_number_, int direction_ = 1, int nulls_direction_ = 1, - const std::shared_ptr & collator_ = nullptr, - bool with_fill_ = false, const FillColumnDescription & fill_description_ = {}) - : column_number(column_number_), direction(direction_), nulls_direction(nulls_direction_), collator(collator_) - , with_fill(with_fill_), fill_description(fill_description_) {} - - explicit SortColumnDescription( - const std::string & column_name_, int direction_ = 1, int nulls_direction_ = 1, - const std::shared_ptr & collator_ = nullptr, - bool with_fill_ = false, const FillColumnDescription & fill_description_ = {}) - : column_name(column_name_), column_number(0), direction(direction_), nulls_direction(nulls_direction_) - , collator(collator_), with_fill(with_fill_), fill_description(fill_description_) {} + const std::string & column_name_, + int direction_ = 1, + int nulls_direction_ = 1, + const std::shared_ptr & collator_ = nullptr, + bool with_fill_ = false, + const FillColumnDescription & fill_description_ = {}) + : column_name(column_name_) + , direction(direction_) + , nulls_direction(nulls_direction_) + , collator(collator_) + , with_fill(with_fill_) + , fill_description(fill_description_) + { + } bool operator == (const SortColumnDescription & other) const { - return column_name == other.column_name && column_number == other.column_number - && direction == other.direction && nulls_direction == other.nulls_direction; + return column_name == other.column_name && direction == other.direction && nulls_direction == other.nulls_direction; } bool operator != (const SortColumnDescription & other) const @@ -72,22 +72,30 @@ struct SortColumnDescription return !(*this == other); } - std::string dump() const - { - return fmt::format("{}:{}:dir {}nulls ", column_name, column_number, direction, nulls_direction); - } + std::string dump() const { return fmt::format("{}:dir {}nulls {}", column_name, direction, nulls_direction); } - void explain(JSONBuilder::JSONMap & map, const Block & header) const; + void explain(JSONBuilder::JSONMap & map) const; +}; + +struct SortColumnDescriptionWithColumnIndex +{ + SortColumnDescription base; + size_t column_number; + + SortColumnDescriptionWithColumnIndex(SortColumnDescription description_, size_t column_number_) + : base(std::move(description_)), column_number(column_number_) + { + } }; /// Description of the sorting rule for several columns. using SortDescription = std::vector; +using SortDescriptionWithPositions = std::vector; /// Outputs user-readable description into `out`. -void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out); +void dumpSortDescription(const SortDescription & description, WriteBuffer & out); std::string dumpSortDescription(const SortDescription & description); -JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description, const Block & header); - +JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index dda2e3f2142..5091debbe72 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2249,10 +2249,6 @@ static bool windowDescriptionComparator(const WindowDescription * _left, const W return true; else if (left[i].column_name > right[i].column_name) return false; - else if (left[i].column_number < right[i].column_number) - return true; - else if (left[i].column_number > right[i].column_number) - return false; else if (left[i].direction < right[i].direction) return true; else if (left[i].direction > right[i].direction) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 5e795c5760a..f46333dc00a 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -1025,7 +1025,7 @@ std::optional MutationsInterpreter::getStorageSortDescriptionIf for (size_t i = 0; i < sort_columns_size; ++i) { if (header.has(sort_columns[i])) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); else return {}; } diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 224b13d2c45..28bbea54110 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -430,8 +430,8 @@ MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vectorgetName()}); + sort_description.emplace_back(ordered_set[i]->getName(), 1, 1); } sortBlock(block_to_sort, sort_description); diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp index 3281445022e..4343e8c7fc6 100644 --- a/src/Interpreters/sortBlock.cpp +++ b/src/Interpreters/sortBlock.cpp @@ -98,9 +98,7 @@ ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, c { const auto & sort_column_description = description[i]; - const IColumn * column = !sort_column_description.column_name.empty() - ? block.getByName(sort_column_description.column_name).column.get() - : block.safeGetByPosition(sort_column_description.column_number).column.get(); + const IColumn * column = block.getByName(sort_column_description.column_name).column.get(); if (isCollationRequired(sort_column_description)) { diff --git a/src/Processors/LimitTransform.cpp b/src/Processors/LimitTransform.cpp index 36c58e1454e..48f29680da2 100644 --- a/src/Processors/LimitTransform.cpp +++ b/src/Processors/LimitTransform.cpp @@ -38,12 +38,7 @@ LimitTransform::LimitTransform( } for (const auto & desc : description) - { - if (!desc.column_name.empty()) - sort_column_positions.push_back(header_.getPositionByName(desc.column_name)); - else - sort_column_positions.push_back(desc.column_number); - } + sort_column_positions.push_back(header_.getPositionByName(desc.column_name)); } Chunk LimitTransform::makeChunkWithPreviousRow(const Chunk & chunk, UInt64 row) const diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index af31ef01fcd..ebc1b37074b 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -104,7 +104,7 @@ static AggregatingSortedAlgorithm::ColumnsDefinition defineColumns( /// Included into PK? auto it = description.begin(); for (; it != description.end(); ++it) - if (it->column_name == column.name || (it->column_name.empty() && it->column_number == i)) + if (it->column_name == column.name) break; if (it != description.end()) @@ -290,11 +290,10 @@ void AggregatingSortedAlgorithm::AggregatingMergedData::initAggregateDescription AggregatingSortedAlgorithm::AggregatingSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, size_t max_block_size) - : IMergingAlgorithmWithDelayedChunk(num_inputs, description_) - , columns_definition(defineColumns(header, description_)) - , merged_data(getMergedColumns(header, columns_definition), max_block_size, columns_definition) + const Block & header_, size_t num_inputs, SortDescription description_, size_t max_block_size) + : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, description_) + , columns_definition(defineColumns(header_, description_)) + , merged_data(getMergedColumns(header_, columns_definition), max_block_size, columns_definition) { } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index 592562c47b9..5dfec31c009 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -21,7 +21,7 @@ namespace ErrorCodes } CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( - const Block & header, + const Block & header_, size_t num_inputs, SortDescription description_, const String & sign_column, @@ -30,9 +30,9 @@ CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( Poco::Logger * log_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) - , sign_column_number(header.getPositionByName(sign_column)) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + , sign_column_number(header_.getPositionByName(sign_column)) , only_positive_sign(only_positive_sign_) , log(log_) { diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index fdea3c23dc2..5d8a593c682 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -14,11 +14,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -FinishAggregatingInOrderAlgorithm::State::State( - const Chunk & chunk, const SortDescription & desc, Int64 total_bytes_) - : all_columns(chunk.getColumns()) - , num_rows(chunk.getNumRows()) - , total_bytes(total_bytes_) +FinishAggregatingInOrderAlgorithm::State::State(const Chunk & chunk, const SortDescriptionWithPositions & desc, Int64 total_bytes_) + : all_columns(chunk.getColumns()), num_rows(chunk.getNumRows()), total_bytes(total_bytes_) { if (!chunk) return; @@ -32,25 +29,13 @@ FinishAggregatingInOrderAlgorithm::FinishAggregatingInOrderAlgorithm( const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, + const SortDescription & description_, size_t max_block_size_, size_t max_block_bytes_) - : header(header_) - , num_inputs(num_inputs_) - , params(params_) - , description(std::move(description_)) - , max_block_size(max_block_size_) - , max_block_bytes(max_block_bytes_) + : header(header_), num_inputs(num_inputs_), params(params_), max_block_size(max_block_size_), max_block_bytes(max_block_bytes_) { - /// Replace column names in description to positions. - for (auto & column_description : description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = header_.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } + for (const auto & column_description : description_) + description.emplace_back(column_description, header_.getPositionByName(column_description.column_name)); } void FinishAggregatingInOrderAlgorithm::initialize(Inputs inputs) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h index f3a1bd40635..ff31886f438 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h @@ -41,7 +41,7 @@ public: const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, + const SortDescription & description_, size_t max_block_size_, size_t max_block_bytes_); @@ -69,7 +69,7 @@ private: /// Number of bytes in all columns + number of bytes in arena, related to current chunk. size_t total_bytes = 0; - State(const Chunk & chunk, const SortDescription & description, Int64 total_bytes_); + State(const Chunk & chunk, const SortDescriptionWithPositions & description, Int64 total_bytes_); State() = default; bool isValid() const { return current_row < num_rows; } @@ -78,7 +78,7 @@ private: Block header; size_t num_inputs; AggregatingTransformParamsPtr params; - SortDescription description; + SortDescriptionWithPositions description; size_t max_block_size; size_t max_block_bytes; diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index 6464f10ca58..eff62d73f50 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -30,12 +30,16 @@ static GraphiteRollupSortedAlgorithm::ColumnsDefinition defineColumns( } GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, size_t max_block_size, - Graphite::Params params_, time_t time_of_merge_) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), nullptr, max_row_refs) - , merged_data(header.cloneEmptyColumns(), false, max_block_size) - , params(std::move(params_)), time_of_merge(time_of_merge_) + const Block & header_, + size_t num_inputs, + SortDescription description_, + size_t max_block_size, + Graphite::Params params_, + time_t time_of_merge_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), false, max_block_size) + , params(std::move(params_)) + , time_of_merge(time_of_merge_) { size_t max_size_of_aggregate_state = 0; size_t max_alignment_of_aggregate_state = 1; @@ -50,7 +54,7 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( } merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); - columns_definition = defineColumns(header, params); + columns_definition = defineColumns(header_, params); } UInt32 GraphiteRollupSortedAlgorithm::selectPrecision(const Graphite::Retentions & retentions, time_t time) const diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp index e4c60d7609c..1d0be726c16 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp @@ -4,12 +4,8 @@ namespace DB { -IMergingAlgorithmWithDelayedChunk::IMergingAlgorithmWithDelayedChunk( - size_t num_inputs, - SortDescription description_) - : description(std::move(description_)) - , current_inputs(num_inputs) - , cursors(num_inputs) +IMergingAlgorithmWithDelayedChunk::IMergingAlgorithmWithDelayedChunk(Block header_, size_t num_inputs, SortDescription description_) + : description(std::move(description_)), header(std::move(header_)), current_inputs(num_inputs), cursors(num_inputs) { } @@ -22,7 +18,8 @@ void IMergingAlgorithmWithDelayedChunk::initializeQueue(Inputs inputs) if (!current_inputs[source_num].chunk) continue; - cursors[source_num] = SortCursorImpl(current_inputs[source_num].chunk.getColumns(), description, source_num, current_inputs[source_num].permutation); + cursors[source_num] = SortCursorImpl( + header, current_inputs[source_num].chunk.getColumns(), description, source_num, current_inputs[source_num].permutation); } queue = SortingHeap(cursors); @@ -37,7 +34,7 @@ void IMergingAlgorithmWithDelayedChunk::updateCursor(Input & input, size_t sourc last_chunk_sort_columns = std::move(cursors[source_num].sort_columns); current_input.swap(input); - cursors[source_num].reset(current_input.chunk.getColumns(), {}, current_input.permutation); + cursors[source_num].reset(current_input.chunk.getColumns(), header, current_input.permutation); queue.push(cursors[source_num]); } diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h index 69530a707c2..e9f735f4a71 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h @@ -10,9 +10,7 @@ namespace DB class IMergingAlgorithmWithDelayedChunk : public IMergingAlgorithm { public: - IMergingAlgorithmWithDelayedChunk( - size_t num_inputs, - SortDescription description_); + IMergingAlgorithmWithDelayedChunk(Block header_, size_t num_inputs, SortDescription description_); protected: SortingHeap queue; @@ -28,6 +26,8 @@ protected: bool skipLastRowFor(size_t input_number) const { return current_inputs[input_number].skip_last_row; } private: + Block header; + /// Inputs currently being merged. Inputs current_inputs; SortCursorImpls cursors; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index 97abffdc167..2e87de1ae29 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -4,11 +4,9 @@ namespace DB { IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( - size_t num_inputs, - SortDescription description_, - WriteBuffer * out_row_sources_buf_, - size_t max_row_refs) - : description(std::move(description_)) + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs) + : header(std::move(header_)) + , description(std::move(description_)) , chunk_allocator(num_inputs + max_row_refs) , cursors(num_inputs) , sources(num_inputs) @@ -39,7 +37,7 @@ void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) source.skip_last_row = inputs[source_num].skip_last_row; source.chunk = chunk_allocator.alloc(inputs[source_num].chunk); - cursors[source_num] = SortCursorImpl(source.chunk->getColumns(), description, source_num, inputs[source_num].permutation); + cursors[source_num] = SortCursorImpl(header, source.chunk->getColumns(), description, source_num, inputs[source_num].permutation); source.chunk->all_columns = cursors[source_num].all_columns; source.chunk->sort_columns = cursors[source_num].sort_columns; @@ -55,7 +53,7 @@ void IMergingAlgorithmWithSharedChunks::consume(Input & input, size_t source_num auto & source = sources[source_num]; source.skip_last_row = input.skip_last_row; source.chunk = chunk_allocator.alloc(input.chunk); - cursors[source_num].reset(source.chunk->getColumns(), {}, input.permutation); + cursors[source_num].reset(source.chunk->getColumns(), header, input.permutation); source.chunk->all_columns = cursors[source_num].all_columns; source.chunk->sort_columns = cursors[source_num].sort_columns; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h index 65c456ea44c..32ef23ab6e5 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h @@ -10,15 +10,13 @@ class IMergingAlgorithmWithSharedChunks : public IMergingAlgorithm { public: IMergingAlgorithmWithSharedChunks( - size_t num_inputs, - SortDescription description_, - WriteBuffer * out_row_sources_buf_, - size_t max_row_refs); + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs); void initialize(Inputs inputs) override; void consume(Input & input, size_t source_num) override; private: + Block header; SortDescription description; /// Allocator must be destroyed after source_chunks. diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index 6b2f0f571a1..1765615f9d1 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -11,30 +11,22 @@ namespace ErrorCodes } MergingSortedAlgorithm::MergingSortedAlgorithm( - const Block & header, + Block header_, size_t num_inputs, SortDescription description_, size_t max_block_size, UInt64 limit_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + : header(std::move(header_)) + , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) , description(std::move(description_)) , limit(limit_) + , has_collation(std::any_of(description.begin(), description.end(), [](const auto & descr) { return descr.collator != nullptr; })) , out_row_sources_buf(out_row_sources_buf_) , current_inputs(num_inputs) , cursors(num_inputs) { - /// Replace column names in description to positions. - for (auto & column_description : description) - { - has_collation |= column_description.collator != nullptr; - if (!column_description.column_name.empty()) - { - column_description.column_number = header.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } } void MergingSortedAlgorithm::addInput() @@ -65,7 +57,7 @@ void MergingSortedAlgorithm::initialize(Inputs inputs) continue; prepareChunk(chunk); - cursors[source_num] = SortCursorImpl(chunk.getColumns(), description, source_num); + cursors[source_num] = SortCursorImpl(header, chunk.getColumns(), description, source_num); } if (has_collation) @@ -78,7 +70,7 @@ void MergingSortedAlgorithm::consume(Input & input, size_t source_num) { prepareChunk(input.chunk); current_inputs[source_num].swap(input); - cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), {}); + cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), header); if (has_collation) queue_with_collation.push(cursors[source_num]); diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h index 63dced26dd4..cf3ec44f5fc 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h @@ -14,7 +14,7 @@ class MergingSortedAlgorithm final : public IMergingAlgorithm { public: MergingSortedAlgorithm( - const Block & header, + Block header_, size_t num_inputs, SortDescription description_, size_t max_block_size, @@ -31,6 +31,8 @@ public: const MergedData & getMergedData() const { return merged_data; } private: + Block header; + MergedData merged_data; /// Settings diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index b8c788ed1fc..4afd01c988f 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -5,16 +5,18 @@ namespace DB { ReplacingSortedAlgorithm::ReplacingSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, const String & version_column, - size_t max_block_size, - WriteBuffer * out_row_sources_buf_, - bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + const Block & header_, + size_t num_inputs, + SortDescription description_, + const String & version_column, + size_t max_block_size, + WriteBuffer * out_row_sources_buf_, + bool use_average_block_sizes) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size) { if (!version_column.empty()) - version_column_number = header.getPositionByName(version_column); + version_column_number = header_.getPositionByName(version_column); } void ReplacingSortedAlgorithm::insertRow() diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp index 0247b8677af..dc4270d4041 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp @@ -101,10 +101,10 @@ struct SummingSortedAlgorithm::AggregateDescription }; -static bool isInPrimaryKey(const SortDescription & description, const std::string & name, const size_t number) +static bool isInPrimaryKey(const SortDescription & description, const std::string & name) { for (const auto & desc : description) - if (desc.column_name == name || (desc.column_name.empty() && desc.column_number == number)) + if (desc.column_name == name) return true; return false; @@ -251,7 +251,7 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( } /// Are they inside the primary key or partition key? - if (isInPrimaryKey(description, column.name, i) || isInPartitionKey(column.name, partition_key_columns)) + if (isInPrimaryKey(description, column.name) || isInPartitionKey(column.name, partition_key_columns)) { def.column_numbers_not_to_aggregate.push_back(i); continue; @@ -307,7 +307,7 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( /// no elements of map could be in primary key auto column_num_it = map.second.begin(); for (; column_num_it != map.second.end(); ++column_num_it) - if (isInPrimaryKey(description, header.safeGetByPosition(*column_num_it).name, *column_num_it)) + if (isInPrimaryKey(description, header.safeGetByPosition(*column_num_it).name)) break; if (column_num_it != map.second.end()) { @@ -687,14 +687,15 @@ Chunk SummingSortedAlgorithm::SummingMergedData::pull() SummingSortedAlgorithm::SummingSortedAlgorithm( - const Block & header, size_t num_inputs, + const Block & header_, + size_t num_inputs, SortDescription description_, const Names & column_names_to_sum, const Names & partition_key_columns, size_t max_block_size) - : IMergingAlgorithmWithDelayedChunk(num_inputs, std::move(description_)) - , columns_definition(defineColumns(header, description, column_names_to_sum, partition_key_columns)) - , merged_data(getMergedDataColumns(header, columns_definition), max_block_size, columns_definition) + : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, std::move(description_)) + , columns_definition(defineColumns(header_, description, column_names_to_sum, partition_key_columns)) + , merged_data(getMergedDataColumns(header_, columns_definition), max_block_size, columns_definition) { } diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp index 672242b253b..cbafa53d0a3 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp @@ -8,19 +8,20 @@ namespace DB static const size_t MAX_ROWS_IN_MULTIVERSION_QUEUE = 8192; VersionedCollapsingAlgorithm::VersionedCollapsingAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, const String & sign_column_, + const Block & header_, + size_t num_inputs, + SortDescription description_, + const String & sign_column_, size_t max_block_size, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks( - num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) + , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size) /// -1 for +1 in FixedSizeDequeWithGaps's internal buffer. 3 is a reasonable minimum size to collapse anything. , max_rows_in_queue(std::min(std::max(3, max_block_size), MAX_ROWS_IN_MULTIVERSION_QUEUE) - 1) , current_keys(max_rows_in_queue) { - sign_column_number = header.getPositionByName(sign_column_); + sign_column_number = header_.getPositionByName(sign_column_); } inline ALWAYS_INLINE static void writeRowSourcePart(WriteBuffer & buffer, RowSourcePart row_source) diff --git a/src/Processors/QueryPlan/FillingStep.cpp b/src/Processors/QueryPlan/FillingStep.cpp index 223892aa528..a94bbdb0877 100644 --- a/src/Processors/QueryPlan/FillingStep.cpp +++ b/src/Processors/QueryPlan/FillingStep.cpp @@ -48,13 +48,13 @@ void FillingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build void FillingStep::describeActions(FormatSettings & settings) const { settings.out << String(settings.offset, ' '); - dumpSortDescription(sort_description, input_streams.front().header, settings.out); + dumpSortDescription(sort_description, settings.out); settings.out << '\n'; } void FillingStep::describeActions(JSONBuilder::JSONMap & map) const { - map.add("Sort Description", explainSortDescription(sort_description, input_streams.front().header)); + map.add("Sort Description", explainSortDescription(sort_description)); } } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 42fbc49b3e7..a14513aceb0 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -612,14 +612,8 @@ static void addMergingFinal( ColumnNumbers key_columns; key_columns.reserve(sort_description.size()); - for (const auto & desc : sort_description) - { - if (!desc.column_name.empty()) - key_columns.push_back(header.getPositionByName(desc.column_name)); - else - key_columns.emplace_back(desc.column_number); - } + key_columns.push_back(header.getPositionByName(desc.column_name)); pipe.addSimpleTransform([&](const Block & stream_header) { @@ -774,9 +768,8 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( Names partition_key_columns = metadata_for_reading->getPartitionKey().column_names; - const auto & header = pipe.getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); addMergingFinal( pipe, diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 32b314b1c50..1e56c02504b 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -206,17 +206,17 @@ void SortingStep::describeActions(FormatSettings & settings) const if (!prefix_description.empty()) { settings.out << prefix << "Prefix sort description: "; - dumpSortDescription(prefix_description, input_streams.front().header, settings.out); + dumpSortDescription(prefix_description, settings.out); settings.out << '\n'; settings.out << prefix << "Result sort description: "; - dumpSortDescription(result_description, input_streams.front().header, settings.out); + dumpSortDescription(result_description, settings.out); settings.out << '\n'; } else { settings.out << prefix << "Sort description: "; - dumpSortDescription(result_description, input_streams.front().header, settings.out); + dumpSortDescription(result_description, settings.out); settings.out << '\n'; } @@ -228,11 +228,11 @@ void SortingStep::describeActions(JSONBuilder::JSONMap & map) const { if (!prefix_description.empty()) { - map.add("Prefix Sort Description", explainSortDescription(prefix_description, input_streams.front().header)); - map.add("Result Sort Description", explainSortDescription(result_description, input_streams.front().header)); + map.add("Prefix Sort Description", explainSortDescription(prefix_description)); + map.add("Result Sort Description", explainSortDescription(result_description)); } else - map.add("Sort Description", explainSortDescription(result_description, input_streams.front().header)); + map.add("Sort Description", explainSortDescription(result_description)); if (limit) map.add("Limit", limit); diff --git a/src/Processors/QueryPlan/WindowStep.cpp b/src/Processors/QueryPlan/WindowStep.cpp index cd4bb5f6730..df42ca9e60f 100644 --- a/src/Processors/QueryPlan/WindowStep.cpp +++ b/src/Processors/QueryPlan/WindowStep.cpp @@ -129,7 +129,7 @@ void WindowStep::describeActions(JSONBuilder::JSONMap & map) const } if (!window_description.order_by.empty()) - map.add("Sort Description", explainSortDescription(window_description.order_by, {})); + map.add("Sort Description", explainSortDescription(window_description.order_by)); auto functions_array = std::make_unique(); for (const auto & func : window_functions) diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 63497ea1af4..c998818a3ec 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -26,7 +26,6 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( , max_block_size(max_block_size_) , max_block_bytes(max_block_bytes_) , params(std::move(params_)) - , group_by_description(group_by_description_) , aggregate_columns(params->params.aggregates_size) , many_data(std::move(many_data_)) , variants(*many_data->variants[current_variant]) @@ -34,15 +33,8 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( /// We won't finalize states in order to merge same states (generated due to multi-thread execution) in AggregatingSortedTransform res_header = params->getCustomHeader(false); - /// Replace column names to column position in description_sorted. - for (auto & column_description : group_by_description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = res_header.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } + for (const auto & column_description : group_by_description_) + group_by_description.emplace_back(column_description, res_header.getPositionByName(column_description.column_name)); } AggregatingInOrderTransform::~AggregatingInOrderTransform() = default; diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.h b/src/Processors/Transforms/AggregatingInOrderTransform.h index e4c217a8f81..f900040d549 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.h +++ b/src/Processors/Transforms/AggregatingInOrderTransform.h @@ -51,7 +51,7 @@ private: MutableColumns res_aggregate_columns; AggregatingTransformParamsPtr params; - SortDescription group_by_description; + SortDescriptionWithPositions group_by_description; Aggregator::AggregateColumns aggregate_columns; diff --git a/src/Processors/Transforms/CheckSortedTransform.cpp b/src/Processors/Transforms/CheckSortedTransform.cpp index 3d4518a935d..4491301e274 100644 --- a/src/Processors/Transforms/CheckSortedTransform.cpp +++ b/src/Processors/Transforms/CheckSortedTransform.cpp @@ -12,33 +12,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -CheckSortedTransform::CheckSortedTransform( - const Block & header_, - const SortDescription & sort_description_) - : ISimpleTransform(header_, header_, false) - , sort_description_map(addPositionsToSortDescriptions(sort_description_)) +CheckSortedTransform::CheckSortedTransform(const Block & header, const SortDescription & sort_description) + : ISimpleTransform(header, header, false) { + for (const auto & column_description : sort_description) + sort_description_map.emplace_back(column_description, header.getPositionByName(column_description.column_name)); } -SortDescriptionsWithPositions -CheckSortedTransform::addPositionsToSortDescriptions(const SortDescription & sort_description) -{ - SortDescriptionsWithPositions result; - result.reserve(sort_description.size()); - const auto & header = getInputPort().getHeader(); - - for (SortColumnDescription description_copy : sort_description) - { - if (!description_copy.column_name.empty()) - description_copy.column_number = header.getPositionByName(description_copy.column_name); - - result.push_back(description_copy); - } - - return result; -} - - void CheckSortedTransform::transform(Chunk & chunk) { size_t num_rows = chunk.getNumRows(); @@ -54,7 +34,7 @@ void CheckSortedTransform::transform(Chunk & chunk) const IColumn * left_col = left[column_number].get(); const IColumn * right_col = right[column_number].get(); - int res = elem.direction * left_col->compareAt(left_index, right_index, *right_col, elem.nulls_direction); + int res = elem.base.direction * left_col->compareAt(left_index, right_index, *right_col, elem.base.nulls_direction); if (res < 0) { return; diff --git a/src/Processors/Transforms/CheckSortedTransform.h b/src/Processors/Transforms/CheckSortedTransform.h index d1b13d22578..4daaaf79fdf 100644 --- a/src/Processors/Transforms/CheckSortedTransform.h +++ b/src/Processors/Transforms/CheckSortedTransform.h @@ -5,16 +5,12 @@ namespace DB { -using SortDescriptionsWithPositions = std::vector; - /// Streams checks that flow of blocks is sorted in the sort_description order /// Othrewise throws exception in readImpl function. class CheckSortedTransform : public ISimpleTransform { public: - CheckSortedTransform( - const Block & header_, - const SortDescription & sort_description_); + CheckSortedTransform(const Block & header, const SortDescription & sort_description); String getName() const override { return "CheckSortedTransform"; } @@ -23,10 +19,7 @@ protected: void transform(Chunk & chunk) override; private: - SortDescriptionsWithPositions sort_description_map; + SortDescriptionWithPositions sort_description_map; Columns last_row; - - /// Just checks, that all sort_descriptions has column_number - SortDescriptionsWithPositions addPositionsToSortDescriptions(const SortDescription & sort_description); }; } diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 5600476fd77..13d039ebcae 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -9,8 +9,9 @@ namespace ErrorCodes } DistinctSortedTransform::DistinctSortedTransform( - const Block & header, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns) - : ISimpleTransform(header, header, true) + Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns) + : ISimpleTransform(header_, header_, true) + , header(std::move(header_)) , description(std::move(sort_description)) , columns_names(columns) , limit_hint(limit_hint_) @@ -24,7 +25,7 @@ void DistinctSortedTransform::transform(Chunk & chunk) if (column_ptrs.empty()) return; - ColumnRawPtrs clearing_hint_columns(getClearingColumns(chunk, column_ptrs)); + ColumnRawPtrs clearing_hint_columns(getClearingColumns(column_ptrs)); if (data.type == ClearableSetVariants::Type::EMPTY) data.init(ClearableSetVariants::chooseMethod(column_ptrs, key_sizes)); @@ -139,13 +140,13 @@ ColumnRawPtrs DistinctSortedTransform::getKeyColumns(const Chunk & chunk) const return column_ptrs; } -ColumnRawPtrs DistinctSortedTransform::getClearingColumns(const Chunk & chunk, const ColumnRawPtrs & key_columns) const +ColumnRawPtrs DistinctSortedTransform::getClearingColumns(const ColumnRawPtrs & key_columns) const { ColumnRawPtrs clearing_hint_columns; clearing_hint_columns.reserve(description.size()); for (const auto & sort_column_description : description) { - const auto * sort_column_ptr = chunk.getColumns().at(sort_column_description.column_number).get(); + const auto * sort_column_ptr = header.getByName(sort_column_description.column_name).column.get(); const auto it = std::find(key_columns.cbegin(), key_columns.cend(), sort_column_ptr); if (it != key_columns.cend()) /// if found in key_columns clearing_hint_columns.emplace_back(sort_column_ptr); diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index ddac6c18a64..0530a6689e9 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -22,7 +22,8 @@ class DistinctSortedTransform : public ISimpleTransform { public: /// Empty columns_ means all columns. - DistinctSortedTransform(const Block & header, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns); + DistinctSortedTransform( + Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns); String getName() const override { return "DistinctSortedTransform"; } @@ -33,7 +34,7 @@ private: ColumnRawPtrs getKeyColumns(const Chunk & chunk) const; /// When clearing_columns changed, we can clean HashSet to memory optimization /// clearing_columns is a left-prefix of SortDescription exists in key_columns - ColumnRawPtrs getClearingColumns(const Chunk & chunk, const ColumnRawPtrs & key_columns) const; + ColumnRawPtrs getClearingColumns(const ColumnRawPtrs & key_columns) const; static bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m); /// return true if has new data @@ -46,6 +47,7 @@ private: size_t rows, ClearableSetVariants & variants) const; + Block header; SortDescription description; struct PreviousChunk diff --git a/src/Processors/Transforms/FinishSortingTransform.cpp b/src/Processors/Transforms/FinishSortingTransform.cpp index 763ed9ecc49..abded9bd2f0 100644 --- a/src/Processors/Transforms/FinishSortingTransform.cpp +++ b/src/Processors/Transforms/FinishSortingTransform.cpp @@ -21,9 +21,11 @@ static bool isPrefix(const SortDescription & pref_descr, const SortDescription & } FinishSortingTransform::FinishSortingTransform( - const Block & header, const SortDescription & description_sorted_, + const Block & header, + const SortDescription & description_sorted_, const SortDescription & description_to_sort_, - size_t max_merged_block_size_, UInt64 limit_) + size_t max_merged_block_size_, + UInt64 limit_) : SortingTransform(header, description_to_sort_, max_merged_block_size_, limit_) { /// Check for sanity non-modified descriptions @@ -34,7 +36,8 @@ FinishSortingTransform::FinishSortingTransform( /// The target description is modified in SortingTransform constructor. /// To avoid doing the same actions with description_sorted just copy it from prefix of target description. size_t prefix_size = description_sorted_.size(); - description_sorted.assign(description.begin(), description.begin() + prefix_size); + for (size_t i = 0; i < prefix_size; ++i) + description_with_positions.emplace_back(description[i], header_without_constants.getPositionByName(description[i].column_name)); } void FinishSortingTransform::consume(Chunk chunk) @@ -62,7 +65,7 @@ void FinishSortingTransform::consume(Chunk chunk) while (high - low > 1) { ssize_t mid = (low + high) / 2; - if (!less(last_chunk.getColumns(), chunk.getColumns(), last_chunk.getNumRows() - 1, mid, description_sorted)) + if (!less(last_chunk.getColumns(), chunk.getColumns(), last_chunk.getNumRows() - 1, mid, description_with_positions)) low = mid; else high = mid; @@ -100,7 +103,8 @@ void FinishSortingTransform::generate() { if (!merge_sorter) { - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); generated_prefix = true; } diff --git a/src/Processors/Transforms/FinishSortingTransform.h b/src/Processors/Transforms/FinishSortingTransform.h index 63fbb2e0e63..3bebcc0a68f 100644 --- a/src/Processors/Transforms/FinishSortingTransform.h +++ b/src/Processors/Transforms/FinishSortingTransform.h @@ -11,9 +11,12 @@ class FinishSortingTransform : public SortingTransform { public: /// limit - if not 0, allowed to return just first 'limit' rows in sorted order. - FinishSortingTransform(const Block & header, const SortDescription & description_sorted_, + FinishSortingTransform( + const Block & header, + const SortDescription & description_sorted_, const SortDescription & description_to_sort_, - size_t max_merged_block_size_, UInt64 limit_); + size_t max_merged_block_size_, + UInt64 limit_); String getName() const override { return "FinishSortingTransform"; } @@ -22,7 +25,7 @@ protected: void generate() override; private: - SortDescription description_sorted; + SortDescriptionWithPositions description_with_positions; Chunk tail_chunk; }; diff --git a/src/Processors/Transforms/MergeSortingTransform.cpp b/src/Processors/Transforms/MergeSortingTransform.cpp index 73817d7de4a..1fe945cbbc9 100644 --- a/src/Processors/Transforms/MergeSortingTransform.cpp +++ b/src/Processors/Transforms/MergeSortingTransform.cpp @@ -90,16 +90,21 @@ private: MergeSortingTransform::MergeSortingTransform( const Block & header, const SortDescription & description_, - size_t max_merged_block_size_, UInt64 limit_, + size_t max_merged_block_size_, + UInt64 limit_, size_t max_bytes_before_remerge_, double remerge_lowered_memory_bytes_ratio_, - size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, + size_t max_bytes_before_external_sort_, + VolumePtr tmp_volume_, size_t min_free_disk_space_) : SortingTransform(header, description_, max_merged_block_size_, limit_) , max_bytes_before_remerge(max_bytes_before_remerge_) , remerge_lowered_memory_bytes_ratio(remerge_lowered_memory_bytes_ratio_) - , max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_volume(tmp_volume_) - , min_free_disk_space(min_free_disk_space_) {} + , max_bytes_before_external_sort(max_bytes_before_external_sort_) + , tmp_volume(tmp_volume_) + , min_free_disk_space(min_free_disk_space_) +{ +} Processors MergeSortingTransform::expandPipeline() { @@ -180,7 +185,8 @@ void MergeSortingTransform::consume(Chunk chunk) temporary_files.emplace_back(createTemporaryFile(tmp_path)); const std::string & path = temporary_files.back()->path(); - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); auto current_processor = std::make_shared(header_without_constants, log, path); processors.emplace_back(current_processor); @@ -223,7 +229,8 @@ void MergeSortingTransform::generate() if (!generated_prefix) { if (temporary_files.empty()) - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); else { ProfileEvents::increment(ProfileEvents::ExternalSortMerge); @@ -251,7 +258,7 @@ void MergeSortingTransform::remerge() LOG_DEBUG(log, "Re-merging intermediate ORDER BY data ({} blocks with {} rows) to save memory consumption", chunks.size(), sum_rows_in_blocks); /// NOTE Maybe concat all blocks and partial sort will be faster than merge? - MergeSorter remerge_sorter(std::move(chunks), description, max_merged_block_size, limit); + MergeSorter remerge_sorter(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); Chunks new_chunks; size_t new_sum_rows_in_blocks = 0; diff --git a/src/Processors/Transforms/MergeSortingTransform.h b/src/Processors/Transforms/MergeSortingTransform.h index f16bebc2f46..b82ecc9d487 100644 --- a/src/Processors/Transforms/MergeSortingTransform.h +++ b/src/Processors/Transforms/MergeSortingTransform.h @@ -18,13 +18,16 @@ class MergeSortingTransform : public SortingTransform { public: /// limit - if not 0, allowed to return just first 'limit' rows in sorted order. - MergeSortingTransform(const Block & header, - const SortDescription & description_, - size_t max_merged_block_size_, UInt64 limit_, - size_t max_bytes_before_remerge_, - double remerge_lowered_memory_bytes_ratio_, - size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, - size_t min_free_disk_space_); + MergeSortingTransform( + const Block & header, + const SortDescription & description_, + size_t max_merged_block_size_, + UInt64 limit_, + size_t max_bytes_before_remerge_, + double remerge_lowered_memory_bytes_ratio_, + size_t max_bytes_before_external_sort_, + VolumePtr tmp_volume_, + size_t min_free_disk_space_); String getName() const override { return "MergeSortingTransform"; } diff --git a/src/Processors/Transforms/PartialSortingTransform.cpp b/src/Processors/Transforms/PartialSortingTransform.cpp index 3a75571872f..6a787a6cd15 100644 --- a/src/Processors/Transforms/PartialSortingTransform.cpp +++ b/src/Processors/Transforms/PartialSortingTransform.cpp @@ -22,9 +22,7 @@ static ColumnRawPtrs extractColumns(const Block & block, const SortDescription & for (size_t i = 0; i < size; ++i) { - const IColumn * column = !description[i].column_name.empty() - ? block.getByName(description[i].column_name).column.get() - : block.safeGetByPosition(description[i].column_number).column.get(); + const IColumn * column = block.getByName(description[i].column_name).column.get(); res.emplace_back(column); } diff --git a/src/Processors/Transforms/SortingTransform.cpp b/src/Processors/Transforms/SortingTransform.cpp index 8fa9d7adb84..c0f700070fa 100644 --- a/src/Processors/Transforms/SortingTransform.cpp +++ b/src/Processors/Transforms/SortingTransform.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_) +MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_) : chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_) { Chunks nonempty_chunks; @@ -36,7 +36,7 @@ MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t /// which can be inefficient. convertToFullIfSparse(chunk); - cursors.emplace_back(chunk.getColumns(), description); + cursors.emplace_back(header, chunk.getColumns(), description); has_collation |= cursors.back().has_collation; nonempty_chunks.emplace_back(std::move(chunk)); @@ -139,16 +139,6 @@ SortingTransform::SortingTransform( { const auto & sample = inputs.front().getHeader(); - /// Replace column names to column position in sort_description. - for (auto & column_description : description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = sample.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } - /// Remove constants from header and map old indexes to new. size_t num_columns = sample.columns(); ColumnNumbers map(num_columns, num_columns); @@ -169,13 +159,10 @@ SortingTransform::SortingTransform( description_without_constants.reserve(description.size()); for (const auto & column_description : description) { - auto old_pos = column_description.column_number; + auto old_pos = header.getPositionByName(column_description.column_name); auto new_pos = map[old_pos]; if (new_pos < num_columns) - { description_without_constants.push_back(column_description); - description_without_constants.back().column_number = new_pos; - } } description.swap(description_without_constants); diff --git a/src/Processors/Transforms/SortingTransform.h b/src/Processors/Transforms/SortingTransform.h index 0f7cb4347a4..380ef4dff88 100644 --- a/src/Processors/Transforms/SortingTransform.h +++ b/src/Processors/Transforms/SortingTransform.h @@ -15,7 +15,7 @@ namespace DB class MergeSorter { public: - MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_); + MergeSorter(const Block & header, Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_); Chunk read(); @@ -45,8 +45,10 @@ private: class MergeSorterSource : public ISource { public: - MergeSorterSource(Block header, Chunks chunks, SortDescription & description, size_t max_merged_block_size, UInt64 limit) - : ISource(std::move(header)), merge_sorter(std::move(chunks), description, max_merged_block_size, limit) {} + MergeSorterSource(const Block & header, Chunks chunks, SortDescription & description, size_t max_merged_block_size, UInt64 limit) + : ISource(header), merge_sorter(header, std::move(chunks), description, max_merged_block_size, limit) + { + } String getName() const override { return "MergeSorterSource"; } diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 935a11ec5fa..e8241ffe080 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -782,7 +782,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() Block header = pipes.at(0).getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); /// The order of the streams is important: when the key is matched, the elements go in the order of the source stream number. /// In the merged part, the lines with the same key must be in the ascending order of the identifier of original part, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 4805a273c70..47e95121cb0 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -333,7 +333,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( sort_description.reserve(sort_columns_size); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(block.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); ProfileEvents::increment(ProfileEvents::MergeTreeDataWriterBlocks); @@ -521,7 +521,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( sort_description.reserve(sort_columns_size); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(block.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocks); From 4c51329ad64f28ce449ea745a1392b631b623fd2 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 16:18:24 +0100 Subject: [PATCH 54/82] stash --- src/Interpreters/ActionsDAG.cpp | 14 ++++ src/Interpreters/ActionsDAG.h | 5 ++ .../QueryPlan/Optimizations/Optimizations.h | 9 ++- .../Optimizations/liftUpFunctions.cpp | 80 +++++++++++++++++++ src/Processors/QueryPlan/SortingStep.cpp | 24 ++++-- src/Processors/QueryPlan/SortingStep.h | 5 ++ ...on_calculation_after_sorting_and_limit.xml | 4 + .../01655_plan_optimizations.reference | 12 +++ .../0_stateless/01655_plan_optimizations.sh | 10 +++ 9 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp create mode 100644 tests/performance/function_calculation_after_sorting_and_limit.xml diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 25116f5145a..151ca631d2d 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1527,6 +1527,20 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & return res; } +ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const SortDescription & sort_description) const +{ + std::unordered_set split_nodes; + for (const auto & sort_column : sort_description) + { + const auto * node = tryFindInIndex(sort_column.column_name); + if (node) + split_nodes.insert(node); + } + auto res = split(split_nodes); + res.second->project_input = project_input; + return res; +} + ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & column_name) const { const auto * node = tryFindInIndex(column_name); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index b07ab08c997..a7424ac4967 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -7,6 +7,8 @@ #include "config_core.h" +#include + namespace DB { @@ -274,6 +276,9 @@ public: /// Index of initial actions must contain column_name. SplitResult splitActionsForFilter(const std::string & column_name) const; + /// + SplitResult splitActionsBySortingDescription(const SortDescription & sort_description) const; + /// Create actions which may calculate part of filter using only available_inputs. /// If nothing may be calculated, returns nullptr. /// Otherwise, return actions which inputs are from available_inputs. diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 10bc6293537..7438bb18cd4 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,16 +44,19 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); +/// +size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); + inline const auto & getOptimizations() { - static const std::array optimizations = - {{ + static const std::array optimizations = {{ {tryLiftUpArrayJoin, "liftUpArrayJoin", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownLimit, "pushDownLimit", &QueryPlanOptimizationSettings::optimize_plan}, {trySplitFilter, "splitFilter", &QueryPlanOptimizationSettings::optimize_plan}, {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, - }}; + {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, + }}; return optimizations; } diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp new file mode 100644 index 00000000000..abf7ee48cb4 --- /dev/null +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#include +#include + +namespace DB::QueryPlanOptimizations +{ + +void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && actions) +{ + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent_step = parent_node->step; + auto & child_step = child_node->step; + auto * sorting_step = typeid_cast(parent_step.get()); + + // Sorting -> UnnecessaryCalculations + std::swap(parent_step, child_step); + // UnnecessaryCalculations -> Sorting + + sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); + auto input_header = child_step->getInputStreams().at(0).header; + sorting_step->updateOutputStream(input_header); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); +} + +size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) +{ + if (parent_node->children.size() != 1) + return 0; + + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent_step = parent_node->step; + auto & child_step = child_node->step; + auto * sorting_step = typeid_cast(parent_step.get()); + auto * expression_step = typeid_cast(child_step.get()); + + if (!sorting_step || !expression_step) + return 0; + + const auto & sort_columns = sorting_step->getSortDescription(); + const auto & expression = expression_step->getExpression(); + + for (auto sc : sort_columns) + LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", sc.column_name); + + auto split_actions = expression->splitActionsBySortingDescription(sort_columns); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "second: {}", split_actions.second->dumpDAG()); + + // No calculations can be postponed. + if (split_actions.second->trivial()) + return 0; + + // Everything can be done after the sorting. + if (split_actions.first->trivial()) + { + swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + return 2; + } + + // Sorting -> Expression + auto & node = nodes.emplace_back(); + + node.children.swap(child_node->children); + child_node->children.emplace_back(&node); + + node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) + swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) + + return 3; +} +} diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 1e56c02504b..38da1381fa9 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -1,11 +1,12 @@ +#include +#include +#include #include -#include +#include +#include #include #include -#include -#include -#include -#include +#include #include namespace DB @@ -88,6 +89,19 @@ SortingStep::SortingStep( output_stream->sort_mode = DataStream::SortMode::Stream; } +void SortingStep::updateInputStream(const DataStream & input_stream) +{ + input_streams.clear(); + input_streams.emplace_back(input_stream); +} + +void SortingStep::updateOutputStream(Block result_header) +{ + if (input_streams.size() != 1) + throw std::runtime_error{"wasted"}; + output_stream = createOutputStream(input_streams.at(0), result_header, getDataStreamTraits()); +} + void SortingStep::updateLimit(size_t limit_) { if (limit_ && (limit == 0 || limit_ < limit)) diff --git a/src/Processors/QueryPlan/SortingStep.h b/src/Processors/QueryPlan/SortingStep.h index 8e253e71f44..d828cd35dff 100644 --- a/src/Processors/QueryPlan/SortingStep.h +++ b/src/Processors/QueryPlan/SortingStep.h @@ -49,6 +49,11 @@ public: /// Add limit or change it to lower value. void updateLimit(size_t limit_); + void updateInputStream(const DataStream & input_stream); + void updateOutputStream(Block result_header); + + SortDescription getSortDescription() const { return result_description; } + private: enum class Type diff --git a/tests/performance/function_calculation_after_sorting_and_limit.xml b/tests/performance/function_calculation_after_sorting_and_limit.xml new file mode 100644 index 00000000000..ddb8f860600 --- /dev/null +++ b/tests/performance/function_calculation_after_sorting_and_limit.xml @@ -0,0 +1,4 @@ + + SELECT sipHash64(number) FROM numbers(1e8) ORDER BY number LIMIT 5 + SELECT sipHash64(number) FROM numbers(1e8) ORDER BY number + 1 LIMIT 5 + diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 33a7ff44b74..6c792c1092e 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -142,3 +142,15 @@ Filter Filter 2 3 2 3 +> function calculation should be done after sorting and limit (if possible) +> the whole Expression node could be moved after Sorting +Expression +Limit +Expression +Sorting +> Expression should be divided into two subnodes and only one of them could be moved after Sorting +Expression +Limit +Expression +Sorting +Expression diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index b66d788a338..d2f6914ff88 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -196,3 +196,13 @@ $CLICKHOUSE_CLIENT -q " select a, b from ( select number + 1 as a, number + 2 as b from numbers(2) union all select number + 1 as b, number + 2 as a from numbers(2) ) where a != 1 settings enable_optimize_predicate_expression = 0" + +echo "> function calculation should be done after sorting and limit (if possible)" +echo "> the whole Expression node could be moved after Sorting" +$CLICKHOUSE_CLIENT -q " + explain select sipHash64(number) from numbers(100) order by number limit 5" | + sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" +echo "> Expression should be divided into two subnodes and only one of them could be moved after Sorting" +$CLICKHOUSE_CLIENT -q " + explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | + sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" From b095838444b99b4b843da357a3db37f28d02ece0 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 17:20:29 +0100 Subject: [PATCH 55/82] stash --- src/Interpreters/ActionsDAG.cpp | 9 +++------ src/Interpreters/ActionsDAG.h | 7 +++---- src/Processors/QueryPlan/Optimizations/Optimizations.h | 3 ++- .../QueryPlan/Optimizations/liftUpFunctions.cpp | 7 +++++-- src/Processors/QueryPlan/SortingStep.cpp | 8 +++----- src/Processors/QueryPlan/SortingStep.h | 2 +- .../0_stateless/01655_plan_optimizations.reference | 2 +- tests/queries/0_stateless/01655_plan_optimizations.sh | 2 +- 8 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 151ca631d2d..ea90bedd2f6 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1527,15 +1527,12 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & return res; } -ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const SortDescription & sort_description) const +ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const NameSet & sort_columns) const { std::unordered_set split_nodes; - for (const auto & sort_column : sort_description) - { - const auto * node = tryFindInIndex(sort_column.column_name); - if (node) + for (const auto & sort_column : sort_columns) + if (const auto * node = tryFindInIndex(sort_column)) split_nodes.insert(node); - } auto res = split(split_nodes); res.second->project_input = project_input; return res; diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index a7424ac4967..1ff82c8ea60 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -7,8 +7,6 @@ #include "config_core.h" -#include - namespace DB { @@ -276,8 +274,9 @@ public: /// Index of initial actions must contain column_name. SplitResult splitActionsForFilter(const std::string & column_name) const; - /// - SplitResult splitActionsBySortingDescription(const SortDescription & sort_description) const; + /// Splits actions into two parts. The first part contains all the calculations required to calculate sort_columns. + /// The second contains the rest. + SplitResult splitActionsBySortingDescription(const NameSet & sort_columns) const; /// Create actions which may calculate part of filter using only available_inputs. /// If nothing may be calculated, returns nullptr. diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 7438bb18cd4..45da00a7ccd 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,7 +44,8 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); -/// +/// Move ExpressionStep up if possible. +/// May split ExpressionStep and lift up only part of it. size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); inline const auto & getOptimizations() diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index abf7ee48cb4..936ce3c3e5f 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -42,11 +42,14 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (!sorting_step || !expression_step) return 0; - const auto & sort_columns = sorting_step->getSortDescription(); + NameSet sort_columns; + for (const auto & col : sorting_step->getSortDescription()) + sort_columns.insert(col.column_name); + const auto & expression = expression_step->getExpression(); for (auto sc : sort_columns) - LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", sc.column_name); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", fmt::join(sort_columns, ", ")); auto split_actions = expression->splitActionsBySortingDescription(sort_columns); LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 38da1381fa9..3d75c461cf8 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -89,17 +89,15 @@ SortingStep::SortingStep( output_stream->sort_mode = DataStream::SortMode::Stream; } -void SortingStep::updateInputStream(const DataStream & input_stream) +void SortingStep::updateInputStream(DataStream input_stream) { input_streams.clear(); - input_streams.emplace_back(input_stream); + input_streams.push_back(std::move(input_stream)); } void SortingStep::updateOutputStream(Block result_header) { - if (input_streams.size() != 1) - throw std::runtime_error{"wasted"}; - output_stream = createOutputStream(input_streams.at(0), result_header, getDataStreamTraits()); + output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); } void SortingStep::updateLimit(size_t limit_) diff --git a/src/Processors/QueryPlan/SortingStep.h b/src/Processors/QueryPlan/SortingStep.h index d828cd35dff..1738d8d4e45 100644 --- a/src/Processors/QueryPlan/SortingStep.h +++ b/src/Processors/QueryPlan/SortingStep.h @@ -49,7 +49,7 @@ public: /// Add limit or change it to lower value. void updateLimit(size_t limit_); - void updateInputStream(const DataStream & input_stream); + void updateInputStream(DataStream input_stream); void updateOutputStream(Block result_header); SortDescription getSortDescription() const { return result_description; } diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 6c792c1092e..5bdda6ac9aa 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -148,7 +148,7 @@ Expression Limit Expression Sorting -> Expression should be divided into two subnodes and only one of them could be moved after Sorting +> Expression should be divided into two subexpressions and only one of them should be moved after Sorting Expression Limit Expression diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index d2f6914ff88..efbd3973b62 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -202,7 +202,7 @@ echo "> the whole Expression node could be moved after Sorting" $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" -echo "> Expression should be divided into two subnodes and only one of them could be moved after Sorting" +echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" From a08c035443a1c3549e73e17cc66bfe33a2f4cac8 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 17:43:51 +0100 Subject: [PATCH 56/82] stash --- .../QueryPlan/Optimizations/liftUpFunctions.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 936ce3c3e5f..2fc41a0e8d8 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -23,7 +23,7 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); auto input_header = child_step->getInputStreams().at(0).header; - sorting_step->updateOutputStream(input_header); + sorting_step->updateOutputStream(std::move(input_header)); parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); } @@ -45,12 +45,7 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: NameSet sort_columns; for (const auto & col : sorting_step->getSortDescription()) sort_columns.insert(col.column_name); - const auto & expression = expression_step->getExpression(); - - for (auto sc : sort_columns) - LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", fmt::join(sort_columns, ", ")); - auto split_actions = expression->splitActionsBySortingDescription(sort_columns); LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); @@ -69,11 +64,10 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: // Sorting -> Expression auto & node = nodes.emplace_back(); - node.children.swap(child_node->children); child_node->children.emplace_back(&node); - node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) From eedcd61479fc6e35dbbbdee1d67a6e490faf3a7c Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 01:36:06 +0100 Subject: [PATCH 57/82] fix --- .../Optimizations/liftUpFunctions.cpp | 56 +++++++++++------- src/Processors/QueryPlan/SortingStep.cpp | 1 + .../01576_alias_column_rewrite.reference | 9 +-- ...02149_read_in_order_fixed_prefix.reference | 58 ++++++++++--------- 4 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 2fc41a0e8d8..a304b91017c 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -9,7 +9,7 @@ namespace DB::QueryPlanOptimizations { -void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && actions) +void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) { QueryPlan::Node * child_node = parent_node->children.front(); @@ -17,14 +17,24 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions auto & child_step = child_node->step; auto * sorting_step = typeid_cast(parent_step.get()); - // Sorting -> UnnecessaryCalculations + // Sorting -> Expression std::swap(parent_step, child_step); - // UnnecessaryCalculations -> Sorting + // Expression -> Sorting sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - auto input_header = child_step->getInputStreams().at(0).header; + LOG_TRACE( + &Poco::Logger::get("Optimizer"), "New Sorting input header: {}", sorting_step->getInputStreams().at(0).header.dumpStructure()); + auto input_header = sorting_step->getInputStreams().at(0).header; + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Old Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); sorting_step->updateOutputStream(std::move(input_header)); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); + auto description = parent_node->step->getStepDescription(); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + LOG_TRACE( + &Poco::Logger::get("Optimizer"), "New Expression input header: {}", parent_step->getInputStreams().at(0).header.dumpStructure()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Expression output header: {}", parent_step->getOutputStream().header.dumpStructure()); + parent_step->setStepDescription(description + " [lifted up part]"); + // UnneededCalculations -> Sorting } size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) @@ -46,31 +56,35 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: for (const auto & col : sorting_step->getSortDescription()) sort_columns.insert(col.column_name); const auto & expression = expression_step->getExpression(); - auto split_actions = expression->splitActionsBySortingDescription(sort_columns); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "second: {}", split_actions.second->dumpDAG()); + auto [needed_for_sorting, unneeded_for_sorting] = expression->splitActionsBySortingDescription(sort_columns); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Original Expression: {}", expression->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Needed for Sorting: {}", needed_for_sorting->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Unneeded for Sorting: {}", unneeded_for_sorting->dumpDAG()); + + auto description = child_step->getStepDescription(); // No calculations can be postponed. - if (split_actions.second->trivial()) + if (unneeded_for_sorting->trivial()) return 0; // Everything can be done after the sorting. - if (split_actions.first->trivial()) + /*if (needed_for_sorting->trivial()) { - swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); return 2; - } + }*/ - // Sorting -> Expression - auto & node = nodes.emplace_back(); - node.children.swap(child_node->children); - child_node->children.emplace_back(&node); - node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> Expression (child_node) + auto & node_with_needed = nodes.emplace_back(); + node_with_needed.children.swap(child_node->children); + child_node->children.emplace_back(&node_with_needed); + node_with_needed.step + = std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); + node_with_needed.step->setStepDescription(std::move(description)); - // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) - swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); - // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) + // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) + swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); + // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) return 3; } diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 3d75c461cf8..9cc242852bf 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -98,6 +98,7 @@ void SortingStep::updateInputStream(DataStream input_stream) void SortingStep::updateOutputStream(Block result_header) { output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); + updateDistinctColumns(output_stream->header, output_stream->distinct_columns); } void SortingStep::updateLimit(size_t limit_) diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.reference b/tests/queries/0_stateless/01576_alias_column_rewrite.reference index 11cc146dd62..68875735110 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.reference +++ b/tests/queries/0_stateless/01576_alias_column_rewrite.reference @@ -35,10 +35,11 @@ Expression (Projection) ReadFromMergeTree (default.test_table) Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) - Sorting - Expression (Before ORDER BY) - SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree (default.test_table) + Expression (Before ORDER BY [lifted up part]) + Sorting + Expression (Before ORDER BY) + SettingQuotaAndLimits (Set limits and quota after reading from storage) + ReadFromMergeTree (default.test_table) optimize_aggregation_in_order Expression ((Projection + Before ORDER BY)) Aggregating diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference index 9e24b7c6ea6..67a043d6646 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference @@ -7,13 +7,15 @@ ExpressionTransform (Limit) Limit - (Sorting) - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - MergeTreeInOrder × 2 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + MergeTreeInOrder × 2 0 → 1 2020-10-01 9 2020-10-01 9 2020-10-01 9 @@ -23,16 +25,18 @@ ExpressionTransform ExpressionTransform (Limit) Limit - (Sorting) - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - ReverseTransform - MergeTreeReverse 0 → 1 - ReverseTransform - MergeTreeReverse 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + ReverseTransform + MergeTreeReverse 0 → 1 + ReverseTransform + MergeTreeReverse 0 → 1 2020-10-01 9 2020-10-01 9 2020-10-01 9 @@ -42,15 +46,17 @@ ExpressionTransform ExpressionTransform (Limit) Limit - (Sorting) - FinishSortingTransform - PartialSortingTransform - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - MergeTreeInOrder × 2 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + FinishSortingTransform + PartialSortingTransform + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + MergeTreeInOrder × 2 0 → 1 2020-10-11 0 2020-10-11 0 2020-10-11 0 From 3308b9d3afc854339052e2c8ca9a7cf02f4bb142 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 01:36:41 +0100 Subject: [PATCH 58/82] disable test temporarily --- .../01600_remerge_sort_lowered_memory_bytes_ratio.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index 5de4210d3f2..c0de98efd53 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 } -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption From b07f35ce280e923f4f99bd231036894daf356010 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 02:16:59 +0100 Subject: [PATCH 59/82] fix test --- tests/queries/0_stateless/01655_plan_optimizations.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 5bdda6ac9aa..5b6c6f3d4b1 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -148,6 +148,7 @@ Expression Limit Expression Sorting +Expression > Expression should be divided into two subexpressions and only one of them should be moved after Sorting Expression Limit From 85fbf6cc621e46fc03535cc8ecf63bb33494c4df Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 13:52:14 +0100 Subject: [PATCH 60/82] update one more test --- .../queries/0_stateless/01591_window_functions.reference | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 655232fcdd4..c766bf16f19 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -925,10 +925,11 @@ Expression ((Projection + Before ORDER BY)) Window (Window step for window \'ORDER BY o ASC, number ASC\') Sorting (Sorting for window \'ORDER BY o ASC, number ASC\') Window (Window step for window \'ORDER BY number ASC\') - Sorting (Sorting for window \'ORDER BY number ASC\') - Expression ((Before window functions + (Projection + Before ORDER BY))) - SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (SystemNumbers) + Expression ((Before window functions + (Projection + Before ORDER BY)) [lifted up part]) + Sorting (Sorting for window \'ORDER BY number ASC\') + Expression ((Before window functions + (Projection + Before ORDER BY))) + SettingQuotaAndLimits (Set limits and quota after reading from storage) + ReadFromStorage (SystemNumbers) -- A test case for the sort comparator found by fuzzer. SELECT max(number) OVER (ORDER BY number DESC NULLS FIRST), From a39427f00b61347cbf1399934c8f3efa96b60af1 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 12:55:48 +0200 Subject: [PATCH 61/82] clean up --- .../QueryPlan/Optimizations/Optimizations.h | 4 +-- .../Optimizations/liftUpFunctions.cpp | 33 ++++--------------- ...emerge_sort_lowered_memory_bytes_ratio.sql | 2 +- .../01655_plan_optimizations.reference | 2 ++ .../0_stateless/01655_plan_optimizations.sh | 3 ++ 5 files changed, 15 insertions(+), 29 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 45da00a7ccd..1d5b83dc9d0 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,8 +44,8 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); -/// Move ExpressionStep up if possible. -/// May split ExpressionStep and lift up only part of it. +/// Move ExpressionStep after SortingStep if possible. +/// May split ExpressionStep and lift up only a part of it. size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); inline const auto & getOptimizations() diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index a304b91017c..8e4242ea73e 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -9,7 +9,7 @@ namespace DB::QueryPlanOptimizations { -void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) +void swapSortingAndUnneededCalculations(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) { QueryPlan::Node * child_node = parent_node->children.front(); @@ -22,17 +22,11 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions // Expression -> Sorting sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - LOG_TRACE( - &Poco::Logger::get("Optimizer"), "New Sorting input header: {}", sorting_step->getInputStreams().at(0).header.dumpStructure()); auto input_header = sorting_step->getInputStreams().at(0).header; - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Old Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); sorting_step->updateOutputStream(std::move(input_header)); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); + auto description = parent_node->step->getStepDescription(); parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); - LOG_TRACE( - &Poco::Logger::get("Optimizer"), "New Expression input header: {}", parent_step->getInputStreams().at(0).header.dumpStructure()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Expression output header: {}", parent_step->getOutputStream().header.dumpStructure()); parent_step->setStepDescription(description + " [lifted up part]"); // UnneededCalculations -> Sorting } @@ -55,35 +49,22 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: NameSet sort_columns; for (const auto & col : sorting_step->getSortDescription()) sort_columns.insert(col.column_name); - const auto & expression = expression_step->getExpression(); - auto [needed_for_sorting, unneeded_for_sorting] = expression->splitActionsBySortingDescription(sort_columns); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Original Expression: {}", expression->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Needed for Sorting: {}", needed_for_sorting->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Unneeded for Sorting: {}", unneeded_for_sorting->dumpDAG()); - - auto description = child_step->getStepDescription(); + auto [needed_for_sorting, unneeded_for_sorting] = expression_step->getExpression()->splitActionsBySortingDescription(sort_columns); // No calculations can be postponed. if (unneeded_for_sorting->trivial()) return 0; - // Everything can be done after the sorting. - /*if (needed_for_sorting->trivial()) - { - swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); - return 2; - }*/ - // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); - node_with_needed.children.swap(child_node->children); - child_node->children.emplace_back(&node_with_needed); + std::swap(node_with_needed.children, child_node->children); + child_node->children = {&node_with_needed}; node_with_needed.step = std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); - node_with_needed.step->setStepDescription(std::move(description)); + node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) - swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); + swapSortingAndUnneededCalculations(parent_node, std::move(unneeded_for_sorting)); // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) return 3; diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index c0de98efd53..f89fd1c94ca 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,7 +10,7 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 }} select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 5b6c6f3d4b1..218ff7bd8c9 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -155,3 +155,5 @@ Limit Expression Sorting Expression +> this query should be executed without throwing an exception +0 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index efbd3973b62..1f5d88bd8bf 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -206,3 +206,6 @@ echo "> Expression should be divided into two subexpressions and only one of the $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" +echo "> this query should be executed without throwing an exception" +$CLICKHOUSE_CLIENT -q " + select throwIf(number = 5) from (select * from numbers(10)) order by number limit 1" From 5590f78dfe981e5396da91fcedb437772610e32f Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 19:51:58 +0200 Subject: [PATCH 62/82] update remerge_sort_lowered_memory_bytes_ratio --- .../01600_remerge_sort_lowered_memory_bytes_ratio.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index f89fd1c94ca..8646b40563e 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 }} -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; +select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } +select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption From ce40d84eefb0629ff49f55f99caef7c15392aa8b Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 21:16:05 +0200 Subject: [PATCH 63/82] more fixes --- .../Optimizations/liftUpFunctions.cpp | 36 +++++++++++++------ src/Processors/QueryPlan/SortingStep.cpp | 2 +- ...emerge_sort_lowered_memory_bytes_ratio.sql | 4 +-- .../01655_plan_optimizations.reference | 14 +++----- .../0_stateless/01655_plan_optimizations.sh | 8 ++--- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 8e4242ea73e..80b82d989dd 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -2,35 +2,48 @@ #include #include #include +#include -#include -#include +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} -namespace DB::QueryPlanOptimizations +namespace { -void swapSortingAndUnneededCalculations(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) +void swapSortingAndUnneededCalculations(DB::QueryPlan::Node * parent_node, DB::ActionsDAGPtr && unneeded_for_sorting) { - QueryPlan::Node * child_node = parent_node->children.front(); + DB::QueryPlan::Node * child_node = parent_node->children.front(); auto & parent_step = parent_node->step; auto & child_step = child_node->step; - auto * sorting_step = typeid_cast(parent_step.get()); + auto * sorting_step = typeid_cast(parent_step.get()); // Sorting -> Expression std::swap(parent_step, child_step); // Expression -> Sorting - sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - auto input_header = sorting_step->getInputStreams().at(0).header; + if (child_node->children.size() != 1) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "SortingStep is expected to have only one input stream."); + sorting_step->updateInputStream(child_node->children.front()->step->getOutputStream()); + auto input_header = sorting_step->getInputStreams().front().header; sorting_step->updateOutputStream(std::move(input_header)); auto description = parent_node->step->getStepDescription(); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); parent_step->setStepDescription(description + " [lifted up part]"); // UnneededCalculations -> Sorting } +} + +namespace DB::QueryPlanOptimizations +{ + size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) @@ -55,12 +68,15 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (unneeded_for_sorting->trivial()) return 0; + if (child_node->children.size() != 1) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "ExpressionStep is expected to have only one input stream."); + // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); std::swap(node_with_needed.children, child_node->children); child_node->children = {&node_with_needed}; node_with_needed.step - = std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); + = std::make_unique(node_with_needed.children.front()->step->getOutputStream(), std::move(needed_for_sorting)); node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 9cc242852bf..efefbad0ded 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -92,7 +92,7 @@ SortingStep::SortingStep( void SortingStep::updateInputStream(DataStream input_stream) { input_streams.clear(); - input_streams.push_back(std::move(input_stream)); + input_streams.emplace_back(std::move(input_stream)); } void SortingStep::updateOutputStream(Block result_header) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index 8646b40563e..6e23ab9cdb9 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } -select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 218ff7bd8c9..bb9c614f728 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -143,17 +143,11 @@ Filter 2 3 2 3 > function calculation should be done after sorting and limit (if possible) -> the whole Expression node could be moved after Sorting -Expression -Limit -Expression -Sorting -Expression > Expression should be divided into two subexpressions and only one of them should be moved after Sorting -Expression -Limit -Expression +Expression (Before ORDER BY [lifted up part]) +FUNCTION sipHash64 Sorting -Expression +Expression (Before ORDER BY) +FUNCTION plus > this query should be executed without throwing an exception 0 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index 1f5d88bd8bf..0b7f004a2ce 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -198,14 +198,10 @@ $CLICKHOUSE_CLIENT -q " ) where a != 1 settings enable_optimize_predicate_expression = 0" echo "> function calculation should be done after sorting and limit (if possible)" -echo "> the whole Expression node could be moved after Sorting" -$CLICKHOUSE_CLIENT -q " - explain select sipHash64(number) from numbers(100) order by number limit 5" | - sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" $CLICKHOUSE_CLIENT -q " - explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | - sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" + explain actions = 1 select number as n, sipHash64(n) from numbers(100) order by number + 1 limit 5" | + sed 's/^ *//g' | grep -o "^ *\(Expression (Before ORDER BY.*)\|Sorting\|FUNCTION \w\+\)" echo "> this query should be executed without throwing an exception" $CLICKHOUSE_CLIENT -q " select throwIf(number = 5) from (select * from numbers(10)) order by number limit 1" From 440e57769a9f19cf5223e0eb14e66a8808a6cc13 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Wed, 30 Mar 2022 00:29:20 +0200 Subject: [PATCH 64/82] more fizes --- .../Optimizations/liftUpFunctions.cpp | 48 ++++++++----------- src/Processors/QueryPlan/SortingStep.cpp | 2 +- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 80b82d989dd..32918f3e5a2 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -15,28 +15,12 @@ namespace ErrorCodes namespace { -void swapSortingAndUnneededCalculations(DB::QueryPlan::Node * parent_node, DB::ActionsDAGPtr && unneeded_for_sorting) +const DB::DataStream & getChildOutputStream(DB::QueryPlan::Node & node) { - DB::QueryPlan::Node * child_node = parent_node->children.front(); - - auto & parent_step = parent_node->step; - auto & child_step = child_node->step; - auto * sorting_step = typeid_cast(parent_step.get()); - - // Sorting -> Expression - std::swap(parent_step, child_step); - // Expression -> Sorting - - if (child_node->children.size() != 1) - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "SortingStep is expected to have only one input stream."); - sorting_step->updateInputStream(child_node->children.front()->step->getOutputStream()); - auto input_header = sorting_step->getInputStreams().front().header; - sorting_step->updateOutputStream(std::move(input_header)); - - auto description = parent_node->step->getStepDescription(); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); - parent_step->setStepDescription(description + " [lifted up part]"); - // UnneededCalculations -> Sorting + if (node.children.size() != 1) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getStepDescription()); + return node.children.front()->step->getOutputStream(); } } @@ -68,20 +52,26 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (unneeded_for_sorting->trivial()) return 0; - if (child_node->children.size() != 1) - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "ExpressionStep is expected to have only one input stream."); - // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); std::swap(node_with_needed.children, child_node->children); child_node->children = {&node_with_needed}; - node_with_needed.step - = std::make_unique(node_with_needed.children.front()->step->getOutputStream(), std::move(needed_for_sorting)); - node_with_needed.step->setStepDescription(child_step->getStepDescription()); + node_with_needed.step = std::make_unique(getChildOutputStream(node_with_needed), std::move(needed_for_sorting)); + node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) - swapSortingAndUnneededCalculations(parent_node, std::move(unneeded_for_sorting)); - // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) + + std::swap(parent_step, child_step); + // so far the origin Expression (parent_node) -> Sorting (child_node) -> NeededCalculations (node_with_needed) + + sorting_step->updateInputStream(getChildOutputStream(*child_node)); + auto input_header = sorting_step->getInputStreams().at(0).header; + sorting_step->updateOutputStream(std::move(input_header)); + + auto description = parent_step->getStepDescription(); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + parent_step->setStepDescription(description + " [lifted up part]"); + // UneededCalculations (parent_node) -> Sorting (child_node) -> NeededCalculations (node_with_needed) return 3; } diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index efefbad0ded..859c9fd9e19 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -97,7 +97,7 @@ void SortingStep::updateInputStream(DataStream input_stream) void SortingStep::updateOutputStream(Block result_header) { - output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); + output_stream = createOutputStream(input_streams.at(0), std::move(result_header), getDataStreamTraits()); updateDistinctColumns(output_stream->header, output_stream->distinct_columns); } From 698a984c074390127b6989705f9b31ae5a89df7a Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Thu, 31 Mar 2022 13:39:05 +0200 Subject: [PATCH 65/82] throw if sorting column not found --- src/Interpreters/ActionsDAG.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index ea90bedd2f6..f06ac229e94 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1533,6 +1533,10 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const NameS for (const auto & sort_column : sort_columns) if (const auto * node = tryFindInIndex(sort_column)) split_nodes.insert(node); + else + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Sorting column {} wasn't found in the ActionsDAG's index. DAG:\n{}", sort_column, dumpDAG()); + auto res = split(split_nodes); res.second->project_input = project_input; return res; From 0f94a58f3a7bd224662feb7bc8e4e9a954eb167a Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Mon, 4 Apr 2022 14:59:38 +0200 Subject: [PATCH 66/82] use getName() --- src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 32918f3e5a2..2a415f8c5af 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -18,8 +18,7 @@ namespace const DB::DataStream & getChildOutputStream(DB::QueryPlan::Node & node) { if (node.children.size() != 1) - throw DB::Exception( - DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getStepDescription()); + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getName()); return node.children.front()->step->getOutputStream(); } From 6c6fb5c3e822dd4e70363aae4d6874543cdb95a6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Apr 2022 16:32:37 +0200 Subject: [PATCH 67/82] Fix race in cached buffer --- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 18 +++++++++++------- src/IO/ReadBufferFromS3.cpp | 2 +- src/IO/ReadBufferFromS3.h | 6 ++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index 4766b838fda..de671e58687 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -334,15 +334,17 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getImplementationBuffer(File read_buffer_for_file_segment->seek(file_offset_of_buffer_end, SEEK_SET); } - auto impl_range = read_buffer_for_file_segment->getRemainingReadRange(); auto download_offset = file_segment->getDownloadOffset(); if (download_offset != static_cast(read_buffer_for_file_segment->getPosition())) + { + auto impl_range = read_buffer_for_file_segment->getRemainingReadRange(); throw Exception( ErrorCodes::LOGICAL_ERROR, "Buffer's offsets mismatch; cached buffer offset: {}, download_offset: {}, position: {}, implementation buffer offset: {}, " "implementation buffer reading until: {}, file segment info: {}", file_offset_of_buffer_end, download_offset, read_buffer_for_file_segment->getPosition(), impl_range.left, *impl_range.right, file_segment->getInfoForLog()); + } break; } @@ -802,12 +804,14 @@ std::optional CachedReadBufferFromRemoteFS::getLastNonDownloadedOffset() String CachedReadBufferFromRemoteFS::getInfoForLog() { - auto implementation_buffer_read_range_str = - implementation_buffer ? - std::to_string(implementation_buffer->getRemainingReadRange().left) - + '-' - + (implementation_buffer->getRemainingReadRange().right ? std::to_string(*implementation_buffer->getRemainingReadRange().right) : "None") - : "None"; + String implementation_buffer_read_range_str; + if (implementation_buffer) + { + auto read_range = implementation_buffer->getRemainingReadRange(); + implementation_buffer_read_range_str = std::to_string(read_range.left) + '-' + (read_range.right ? std::to_string(*read_range.right) : "None"); + } + else + implementation_buffer_read_range_str = "None"; auto current_file_segment_info = current_file_segment_it == file_segments_holder->file_segments.end() ? "None" : (*current_file_segment_it)->getInfoForLog(); diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 6616d92b492..728893e912d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -240,7 +240,7 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position) SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const { - return Range{.left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt}; + return Range{ .left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt }; } std::unique_ptr ReadBufferFromS3::initialize() diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 5c9d709d58e..0040ede6d6b 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -33,8 +33,10 @@ private: String key; UInt64 max_single_read_retries; - off_t offset = 0; - off_t read_until_position = 0; + /// These variables are atomic because they can be used for `logging only` + /// from separate thread other than the one which uses the buffer for s3 reading. + std::atomic offset = 0; + std::atomic read_until_position = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; From d69757696721bb9486e524bc288300ad932174a7 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 4 Apr 2022 17:53:01 +0200 Subject: [PATCH 68/82] Update ReadBufferFromS3.h --- src/IO/ReadBufferFromS3.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 0040ede6d6b..5282d9ad482 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -34,6 +34,7 @@ private: UInt64 max_single_read_retries; /// These variables are atomic because they can be used for `logging only` + /// (where it is not important to get consistent result) /// from separate thread other than the one which uses the buffer for s3 reading. std::atomic offset = 0; std::atomic read_until_position = 0; From 86f42e7a3a900649772d06a5444d9bff55dc4361 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 18:07:36 +0200 Subject: [PATCH 69/82] Better check for kafka_num_consumers --- src/Storages/Kafka/StorageKafka.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 4c7465d587d..9c3506742fd 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -779,11 +779,13 @@ void registerStorageKafka(StorageFactory & factory) #undef CHECK_KAFKA_STORAGE_ARGUMENT auto num_consumers = kafka_settings->kafka_num_consumers.value; - auto physical_cpu_cores = getNumberOfPhysicalCPUCores(); + auto max_consumers = std::max(getNumberOfPhysicalCPUCores(), 16); - if (num_consumers > physical_cpu_cores) + if (num_consumers > max_consumers) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}", physical_cpu_cores); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}, it just doesn't make sense. " + "Note that kafka_num_consumers is not number of consumers for Kafka partitions -- they are managed by Kafka client library. " + "kafka_num_consumers is internal amount of threads for ClickHouse and it shouldn't be big", max_consumers); } else if (num_consumers < 1) { From 72331856eb552b7c20208363f92f016f31abc8a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 18:28:26 +0200 Subject: [PATCH 70/82] fix message --- src/Storages/Kafka/StorageKafka.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 9c3506742fd..c9f6bcabcc1 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -783,9 +783,12 @@ void registerStorageKafka(StorageFactory & factory) if (num_consumers > max_consumers) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}, it just doesn't make sense. " - "Note that kafka_num_consumers is not number of consumers for Kafka partitions -- they are managed by Kafka client library. " - "kafka_num_consumers is internal amount of threads for ClickHouse and it shouldn't be big", max_consumers); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The number of consumers can not be bigger than {}. " + "A single consumer can read any number of partitions. Extra consumers are relatively expensive, " + "and using a lot of them can lead to high memory and CPU usage. To achieve better performance " + "of getting data from Kafka, consider using a setting kafka_thread_per_consumer=1, " + "and ensure you have enough threads in MessageBrokerSchedulePool (background_message_broker_schedule_pool_size). " + "See also https://clickhouse.com/docs/integrations/kafka/kafka-table-engine#tuning-performance", max_consumers); } else if (num_consumers < 1) { From d475ce5d169cf3d85f9c768531dc211d247bd27d Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 4 Apr 2022 18:50:50 +0200 Subject: [PATCH 71/82] Fix building ubuntu image from deb-repo --- docker/server/Dockerfile.ubuntu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index cc198772251..6e93bd97036 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -94,8 +94,9 @@ RUN arch=${TARGETARCH:-amd64} \ && apt-get update \ && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ && for package in ${PACKAGES}; do \ - apt-get install --allow-unauthenticated --yes --no-install-recommends "${package}=${VERSION}" || exit 1 \ + packages="${packages} ${package}=${VERSION}" \ ; done \ + && apt-get install --allow-unauthenticated --yes --no-install-recommends ${packages} || exit 1 \ ; fi \ && clickhouse-local -q 'SELECT * FROM system.build_options' \ && rm -rf \ From 3a6bee309b79987a3a21a22cb3b3aab69d56d3b1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 4 Apr 2022 19:10:56 +0200 Subject: [PATCH 72/82] Skip test with ordinary database --- tests/queries/0_stateless/02262_column_ttl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02262_column_ttl.sh b/tests/queries/0_stateless/02262_column_ttl.sh index affb0c802ff..b5e29c9b2a1 100755 --- a/tests/queries/0_stateless/02262_column_ttl.sh +++ b/tests/queries/0_stateless/02262_column_ttl.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-parallel +# Tags: no-parallel, no-ordinary-database # ^^^^^^^^^^^ # Since the underlying view may disappears while flushing log, and leads to: # From 0477e74f42b9d2cc9056574b01f02e82016a0a52 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 19:41:54 +0200 Subject: [PATCH 73/82] Get rid of caps --- src/Storages/AlterCommands.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 16e1f044fd9..15095335a51 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -713,7 +713,7 @@ bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) return false; }; - static const std::unordered_multimap ALLOWED_CONVERSIONS = + static const std::unordered_multimap allowed_conversions = { { typeid(DataTypeEnum8), typeid(DataTypeInt8) }, { typeid(DataTypeEnum16), typeid(DataTypeInt16) }, @@ -735,7 +735,7 @@ bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) return true; /// Types changed, but representation on disk didn't - auto it_range = ALLOWED_CONVERSIONS.equal_range(typeid(*from)); + auto it_range = allowed_conversions.equal_range(typeid(*from)); for (auto it = it_range.first; it != it_range.second; ++it) { if (it->second == typeid(*to)) From d04c48e67a96493ddae35e6fe7fc15c7fc03d363 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 20:14:09 +0200 Subject: [PATCH 74/82] Fix build --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8419f07ae73..42b26db72ce 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2967,11 +2967,11 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) if (!((*it)->getState() == DataPartState::Outdated && it->unique())) { if ((*it)->getState() != DataPartState::Outdated) - LOG_WARNING("Cannot immediately remove part {} because it's not in Outdated state " + LOG_WARNING(log, "Cannot immediately remove part {} because it's not in Outdated state " "usage counter {}", part_name_with_state, it->use_count()); if (!it->unique()) - LOG_WARNING("Cannot immediately remove part {} because someone using it right now " + LOG_WARNING(log, "Cannot immediately remove part {} because someone using it right now " "usage counter {}", part_name_with_state, it->use_count()); return; } From 43f697d7bac7bad36acd95dd6e555be9d96e77f7 Mon Sep 17 00:00:00 2001 From: Nir Peled Date: Mon, 4 Apr 2022 14:29:37 -0400 Subject: [PATCH 75/82] Fixed GA not reporting events. --- website/js/base.js | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/website/js/base.js b/website/js/base.js index 9389028f1ef..1ab8f841dbe 100644 --- a/website/js/base.js +++ b/website/js/base.js @@ -67,22 +67,17 @@ }); } - (function (d, w, c) { - (w[c] = w[c] || []).push(function() { - var is_single_page = $('html').attr('data-single-page') === 'true'; - - if (!is_single_page) { - $('head').each(function(_, element) { - $(element).append( - '' - ); - $(element).append( - '' - ); - }); - } + var is_single_page = $('html').attr('data-single-page') === 'true'; + if (!is_single_page) { + $('head').each(function (_, element) { + $(element).append( + '' + ); + $(element).append( + '' + ); }); - })(document, window, ""); + } var beforePrint = function() { var details = document.getElementsByTagName("details"); From 4d6c030d235f6480a2c978bf7dcc16867d6b2cce Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Apr 2022 23:41:42 +0300 Subject: [PATCH 76/82] Revert "clang-tidy report issues with Medium priority" --- src/IO/ReadBufferFromFileDescriptor.h | 7 ------- src/Interpreters/SystemLog.cpp | 2 +- src/Storages/ColumnsDescription.cpp | 2 +- src/Storages/StorageDistributed.h | 2 +- 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 000f4d371eb..ba1502fb9aa 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -39,10 +39,6 @@ public: { } - virtual ~ReadBufferFromFileDescriptor() override - { - } - int getFD() const { return fd; @@ -84,9 +80,6 @@ public: { use_pread = true; } - virtual ~ReadBufferFromFileDescriptorPRead() override - { - } }; } diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 59533e5a586..3b4d665e41b 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -379,7 +379,7 @@ void SystemLog::flushImpl(const std::vector & to_flush, for (const auto & name_and_type : log_element_names_and_types) log_element_columns.emplace_back(name_and_type.type, name_and_type.name); - Block block(log_element_columns); + Block block(std::move(log_element_columns)); MutableColumns columns = block.mutateColumns(); for (const auto & elem : to_flush) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index f3a939614c1..1264da77b04 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -122,7 +122,7 @@ void ColumnDescription::readText(ReadBuffer & buf) if (col_ast->default_expression) { default_desc.kind = columnDefaultKindFromString(col_ast->default_specifier); - default_desc.expression = col_ast->default_expression; + default_desc.expression = std::move(col_ast->default_expression); } if (col_ast->comment) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index b6d738fb61e..317463783ee 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -44,7 +44,7 @@ class StorageDistributed final : public shared_ptr_helper, p friend class StorageSystemDistributionQueue; public: - virtual ~StorageDistributed() override; + ~StorageDistributed() override; std::string getName() const override { return "Distributed"; } From 808d9afd0f8110faba5ae027051bf0a64e506da3 Mon Sep 17 00:00:00 2001 From: larryluogit Date: Mon, 4 Apr 2022 16:47:14 -0400 Subject: [PATCH 77/82] Fix optin.cplusplus.UninitializedObject issue (#35626) * Fix optin.cplusplus.UninitializedObject issue * Enable optin.cplusplus.UninitializedObject --- .clang-tidy | 1 + src/Common/ColumnsHashingImpl.h | 2 +- src/Common/CompactArray.h | 3 +++ src/Common/JSONBuilder.h | 6 +++--- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 687b3741b1c..2a9cba30a85 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -144,6 +144,7 @@ Checks: '-*, clang-analyzer-cplusplus.SelfAssignment, clang-analyzer-deadcode.DeadStores, clang-analyzer-cplusplus.Move, + clang-analyzer-optin.cplusplus.UninitializedObject, clang-analyzer-optin.cplusplus.VirtualCall, clang-analyzer-security.insecureAPI.UncheckedReturn, clang-analyzer-security.insecureAPI.bcmp, diff --git a/src/Common/ColumnsHashingImpl.h b/src/Common/ColumnsHashingImpl.h index f5a732b275f..7b0650487f5 100644 --- a/src/Common/ColumnsHashingImpl.h +++ b/src/Common/ColumnsHashingImpl.h @@ -125,7 +125,7 @@ class FindResultImpl : public FindResultImplBase, public FindResultImplOffsetBas public: FindResultImpl() - : FindResultImplBase(false), FindResultImplOffsetBase(0) + : FindResultImplBase(false), FindResultImplOffsetBase(0) // NOLINT(clang-analyzer-optin.cplusplus.UninitializedObject) intentionally allow uninitialized value here {} FindResultImpl(Mapped * value_, bool found_, size_t off) diff --git a/src/Common/CompactArray.h b/src/Common/CompactArray.h index 629fa08aaaa..cf97206edb8 100644 --- a/src/Common/CompactArray.h +++ b/src/Common/CompactArray.h @@ -214,6 +214,9 @@ private: /// offset in bits to the next to the rightmost bit at that byte; or zero if the rightmost bit is the rightmost bit in that byte. offset_r = (l + content_width) % 8; + + content_l = nullptr; + content_r = nullptr; } UInt8 ALWAYS_INLINE read(UInt8 value_l) const diff --git a/src/Common/JSONBuilder.h b/src/Common/JSONBuilder.h index 9a218fcf08b..38d19da011d 100644 --- a/src/Common/JSONBuilder.h +++ b/src/Common/JSONBuilder.h @@ -61,7 +61,7 @@ private: class JSONBool : public IItem { public: - explicit JSONBool(bool value_) : value(std::move(value_)) {} + explicit JSONBool(bool value_) : value(value_) {} void format(const FormatSettings & settings, FormatContext & context) override; private: @@ -74,7 +74,7 @@ public: void add(ItemPtr value) { values.push_back(std::move(value)); } void add(std::string value) { add(std::make_unique(std::move(value))); } void add(const char * value) { add(std::make_unique(value)); } - void add(bool value) { add(std::make_unique(std::move(value))); } + void add(bool value) { add(std::make_unique(value)); } template requires std::is_arithmetic_v @@ -99,7 +99,7 @@ public: void add(std::string key, std::string value) { add(std::move(key), std::make_unique(std::move(value))); } void add(std::string key, const char * value) { add(std::move(key), std::make_unique(value)); } void add(std::string key, std::string_view value) { add(std::move(key), std::make_unique(value)); } - void add(std::string key, bool value) { add(std::move(key), std::make_unique(std::move(value))); } + void add(std::string key, bool value) { add(std::move(key), std::make_unique(value)); } template requires std::is_arithmetic_v From 6c7b8a0fd4d6a587cbed3a3d520a02133f54ddb2 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 4 Apr 2022 23:34:33 +0200 Subject: [PATCH 78/82] Update unixodbc to mitigate CVE-2018-7485 --- contrib/unixodbc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/unixodbc b/contrib/unixodbc index b0ad30f7f62..a2cd5395e8c 160000 --- a/contrib/unixodbc +++ b/contrib/unixodbc @@ -1 +1 @@ -Subproject commit b0ad30f7f6289c12b76f04bfb9d466374bb32168 +Subproject commit a2cd5395e8c7f7390025ec93af5bfebef3fb5fcd From 61183ac07b619044c5821a5794bcd903e75f0e60 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 4 Apr 2022 22:24:39 +0000 Subject: [PATCH 79/82] Done --- docker/keeper/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 207dddce1bb..068377e8f8c 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -20,6 +20,8 @@ ENV LANG=en_US.UTF-8 \ COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/ COPY --from=glibc-donor /etc/nsswitch.conf /etc/ COPY entrypoint.sh /entrypoint.sh + +ARG TARGETARCH RUN arch=${TARGETARCH:-amd64} \ && case $arch in \ amd64) mkdir -p /lib64 && ln -sf /lib/ld-2.31.so /lib64/ld-linux-x86-64.so.2 ;; \ From 1d60824d6af7415755992b3c54d4949424278938 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 00:52:37 +0200 Subject: [PATCH 80/82] Highlight headers in PR template --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++-- tests/ci/run_check.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6540b60476f..1b7498c3091 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ -Changelog category (leave one): +## Changelog category (leave one): - New Feature - Improvement - Bug Fix (user-visible misbehaviour in official stable or prestable release) @@ -9,7 +9,7 @@ Changelog category (leave one): - Not for changelog (changelog entry is not required) -Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): +## Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): ... diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 93dc77124c2..bc818ffb6bf 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -162,7 +162,7 @@ def check_pr_description(pr_info): i = 0 while i < len(lines): - if re.match(r"(?i)^[>*_ ]*change\s*log\s*category", lines[i]): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): i += 1 if i >= len(lines): break @@ -191,7 +191,7 @@ def check_pr_description(pr_info): return result_status[:140], category elif re.match( - r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] ): i += 1 # Can have one empty line between header and the entry itself. From a665861f5f06f81c337a7a7648847f23c336cb43 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 01:06:46 +0200 Subject: [PATCH 81/82] Improve descriptrion check logging --- tests/ci/run_check.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index bc818ffb6bf..6f00232be77 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -262,9 +262,14 @@ if __name__ == "__main__": remove_labels(gh, pr_info, pr_labels_to_remove) if description_report: - print("::notice ::Cannot run, description does not match the template") + print( + "::error ::Cannot run, PR description does not match the template: " + f"{description_report}" + ) logging.info( - "PR body doesn't match the template: (start)\n%s\n(end)", pr_info.body + "PR body doesn't match the template: (start)\n%s\n(end)\n" "Reason: %s", + pr_info.body, + description_report, ) url = ( f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" From 588a168e091e834c21d5a4dcb70c2a98f62f11bb Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 10:05:54 +0200 Subject: [PATCH 82/82] Decrease headers size Co-authored-by: Azat Khuzhin --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1b7498c3091..2d8540b57ea 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ -## Changelog category (leave one): +### Changelog category (leave one): - New Feature - Improvement - Bug Fix (user-visible misbehaviour in official stable or prestable release) @@ -9,7 +9,7 @@ - Not for changelog (changelog entry is not required) -## Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): +### Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): ...